@Book{ herr.mart.ea:2000:cad, author = "J.R. Herrero and X. Martorell and J. Torres", title = "Conceptes Avançats de Sistemes Operatius", publisher = "{Edicions UPC: Edicions Virtuals}", year = "2000", address = "Barcelona, Spain", edition = "First", url = "http://personals.ac.upc.edu/josepr", isbn = {84-8301-439-4} } @InProceedings{ herr.nava:2003:abo, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Automatic benchmarking and optimization of codes: an experience with numerical kernels}, booktitle = {Int. Conf. on Software Engineering Research and Practice}, fbooktitle = {Proceedings of the 2003 International Conference on Software Engineering Research and Practice}, year = {2003}, pages = {701--706}, month = jun, publisher = {CSREA Press}, isbn = {1-932415-20-3}, url = "http://personals.ac.upc.edu/josepr", location = {Las Vegas}, abstract = {New algorithms are constantly developed in search of better or faster results. Many variants of code are often tried while searching for the best solution. When the number of code variants or possible input parameters is very high, the process of benchmarking and analysis of results can become cumbersome and error prone. We present a tool for automatic benchmarking which manages a database of possible parameters and the results obtained for them. This tool can automatically choose the optimum code and create a target library. Using it we have generated a library specialized in the operation on very small matrices which is useful in multimedia codes. }, keywords = {Automatic benchmarking, tuning, performance, small matrices} } @InProceedings{ herr.nava:2003:bsv, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Building Software via Shared Knowledge}, booktitle = {Int. Conf. on Software Engineering Research and Practice}, fbooktitle = {Proceedings of the 2003 International Conference on Software Engineering Research and Practice}, year = {2003}, pages = {861--867}, month = jun, publisher = {CSREA Press}, isbn = {1-932415-20-3}, url = "http://personals.ac.upc.edu/josepr", location = {Las Vegas} } @InProceedings{ herr.nava:2003:iph, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {{Improving Performance of Hypermatrix {Cholesky} Factorization}}, booktitle = {Euro-Par 2003,} # lncs # 2790, year = {2003}, pages = {461-469}, month = aug, publisher = springer, url = "http://personals.ac.upc.edu/josepr", location = {Klagenfurt, Austria}, abstract = {This paper shows how a sparse hypermatrix Cholesky factorization can be improved. This is accomplished by means of efficient codes which operate on very small dense matrices. Different matrix sizes or target platforms may require different codes to obtain good performance. We write a set of codes for each matrix operation using different loop orders and unroll factors. Then, for each matrix size, we automatically compile each code fixing matrix leading dimensions and loop sizes, run the resulting executable and keep its Mflops. The best combination is then used to produce the object introduced in a library. Thus, a routine for each desired matrix size is available from the library. The large overhead incurred by the hypermatrix Cholesky factorization of sparse matrices can therefore be lessened by reducing the block size when those routines are used. Using the routines, e.g. matrix multiplication, in our small matrix library produced important speed-ups in our sparse Cholesky code.}, keywords = {Hypermatrix Cholesky, inner kernels, small matrices} } @InProceedings{ herr.nava:2003:oss, author = {Jos\'e R. Herrero and David Benlliure}, title = {Operating System Support for Process Confinement}, booktitle = {Proceedings of the International Conference on Security and Management}, year = {2003}, pages = {421--425}, month = jun, publisher = {CSREA Press}, isbn = {1-932415-17-3}, url = "http://personals.ac.upc.edu/josepr", location = {Las Vegas}, abstract = {Execution of untrusted software can compromise a whole system. Tools for restricting access of software to system resources are essential for security maintenance. Operating systems should offer functionality for building tools which could run in user mode with no special privileges while providing full access control. Thus, they could be made available to any user in the system. In this paper we show ways of extending an operating system in order to provide such function- ality. We present the changes introduced in the Linux kernel to offer the minimum functionality necessary for building such a tool. Using the new functionality we were able to code a program for controlling execution of untrusted software through system call interposition.}, kwds = {Process confinement, system call interposition, user mode monitor.} } @InProceedings{ herr.nava:2004:osp, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Optimization of a Statically Partitioned Hypermatrix Sparse {Cholesky} Factorization}, booktitle = {Workshop on state-of-the-art in scientific computing (PARA'04),} # lncs # {3732}, year = {2004}, pages = {798--807}, month = jun, publisher = springer, issn = {0302-9743}, isbn = {978-3-540-29067-4}, doi = {http://dx.doi.org/10.1007/11558958_96}, url = "http://personals.ac.upc.edu/josepr", location = {Lyngby, Denmark}, abstract = {The sparse Cholesky factorization of some large matrices can require a two dimensional partitioning of the matrix. The sparse hypermatrix storage scheme produces a recursive 2D partitioning of a sparse matrix. The subblocks are stored as dense matrices so BLAS3 routines can be used. However, since we are dealing with sparse matrices some zeros may be stored in those dense blocks. The overhead introduced by the operations on zeros can become large and considerably degrade performance. In this paper we present an improvement to our sequential in-core implementation of a sparse Cholesky factorization based on a hypermatrix storage structure. We compare its performance with several codes and analyze the results. }, keywords = {Sparse Cholesky - Hypermatrix structure - 2D partitioning - Windows in submatrices - Small Matrix Library} } @InProceedings{ herr.nava:2004:ros, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Reducing Overhead in Sparse Hypermatrix {Cholesky} Factorization}, booktitle = {IFIP TC5 Workshop on High Performance Computational Science and Engineering (HPCSE), World Computer Congress}, year = {2004}, pages = {143--154}, month = aug, publisher = springer, issn = {1571-5736}, isbn = {978-0-387-24048-0}, url = {http://www.springer.com/east/home/new+%26+forthcoming+titles+%28default%29?SGWID=5-40356-22-36844470-0} , url2 = "http://personals.ac.upc.edu/josepr", location = {Toulouse, France}, abstract = {The sparse hypermatrix storage scheme produces a recursive 2D partitioning of a sparse matrix. Data subblocks are stored as dense matrices. Since we are dealing with sparse matrices some zeros can be stored in those dense blocks. The overhead introduced by the operations on zeros can become really large and considerably degrade performance. In this paper, we present several techniques for reducing the operations on zeros in a sparse hypermatrix Cholesky factorization. By associating a bit to each column within a data submatrix we create a bit vector. We can avoid computations when the bitwise AND of their bit vectors is null. By keeping information about the actual space within a data submatrix which stores non-zeros (dense window) we can reduce both storage and computation. }, keywords = {Sparse Hypermatrix Cholesky, bit vector, dense window} } @InProceedings{ herr.nava:2005:ala, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Adapting Linear Algebra Codes to the Memory Hierarchy Using a Hypermatrix Scheme}, booktitle = {Int. Conf. on Parallel Processing and Applied Mathematics. LNCS 3911}, fbooktitle = {Proceedings of the International Conference on Parallel Processing and Applied Mathematics}, year = {2005}, pages = {1058--1065}, month = sep, issn = {0302-9743}, isbn = {978-3-540-34141-3}, doi = {http://dx.doi.org/10.1007/11752578_128}, url = "http://personals.ac.upc.edu/josepr", location = {Poznan, Poland}, abstract = {We present the way in which we adapt data and computations to the underlying memory hierarchy by means of a hierarchical data structure known as hypermatrix. The application of orthogonal block forms produced the best performance for the platforms used. }, keywords = {Non-canonical storage, small matrix library, inner kernels} } @InProceedings{ herr.nava:2005:ein, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Efficient Implementation of Nearest Neighbor Classification}, booktitle = {Int. Conf. on Computer Recognition Systems (CORES). Advances in Soft Computing, Vol XVIII}, f-booktitle = {International Conference on Computer Recognition Systems (CORES). Advances in Soft Computing, Vol XVIII}, year = {2005}, pages = {177--186}, month = may, issn = {1615-3871}, issn2 = {1434-9922}, isbn = {978-3-540-25054-8}, url = "http://personals.ac.upc.edu/josepr", location = {Poland} } @InProceedings{ herr.nava:2005:ias, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Intra-Block Amalgamation in Sparse Hypermatrix {Cholesky} Factorization}, booktitle = {Int. Conf. on Computational Science and Engineering}, fbooktitle = {Proceedings of the International Conference on Computational Science and Engineering}, year = {2005}, month = jun, pages = {15--22}, isbn = {975-561-266-1}, url = "http://personals.ac.upc.edu/josepr", location = {Istambul, Turkey}, abstract = {In this paper we present an improvement to our sequential in-core implementation of a sparse Cholesky factorization based on a hypermatrix storage structure. We allow the inclusion of additional zeros in data submatrices to create larger blocks and in this way use more efficient routines for matrix multiplication. Since matrix multiplication takes about 90\% of the total factorization time this is an important point to optimize. }, keywords = {Sparse Cholesky factorization, hypermatrix structure, intra-block amalgamation, small matrix library} } @InProceedings{ herr.nava:2005:sli, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {A Study on Load Imbalance in Parallel Hypermatrix Multiplication using {OpenMP}}, booktitle = {Int. Conf. on Parallel Processing and Applied Mathematics. LNCS 3911}, fbooktitle = {Proceedings of the International Conference on Parallel Processing and Applied Mathematics}, year = {2005}, pages = {124--131}, issn = {0302-9743}, isbn = {978-3-540-34141-3}, doi = {http://dx.doi.org/10.1007/11752578_16}, month = sep, url = "http://personals.ac.upc.edu/josepr", location = {Poznan, Poland} } @Misc{ herr.nava:2006:afa:mis, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {{ACME}: A framework for Accurate Measuments}, year = 2006, note = "http://personals.ac.upc.edu/josepr/Soft/ACME", url = "http://personals.ac.upc.edu/josepr/Soft/ACME" } @InProceedings{ herr.nava:2006:cke, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Compiler-Optimized Kernels: An Efficient Alternative to Hand-Coded Inner Kernels}, booktitle = {Proceedings of the International Conference on Computational Science and its Applications (ICCSA). LNCS 3984}, year = {2006}, pages = {762--771}, month = may, issn = {0302-9743}, isbn = {978-3-540-34079-9}, doi = {http://dx.doi.org/10.1007/11751649_84}, url = "http://personals.ac.upc.edu/josepr", location = {Glasgow, UK}, abstract = {The use of highly optimized inner kernels is of paramount importance for obtaining efficient numerical algorithms. Often, such kernels are created by hand. In this paper, however, we present an alternative way to produce efficient matrix multiplication kernels based on a set of simple codes which can be parameterized at compilation time. Using the resulting kernels we have been able to produce high performance sparse and dense linear algebra codes on a variety of platforms.}, keywords = {Inner kernels, compiler optimization, matrix multiplication} } @InProceedings{ herr.nava:2006:fam, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {A Framework for Accurate Measurements with Low Resolution Clocks}, booktitle = {The 10th IASTED International Conference on Software Engineering and Applications (SEA 2006)}, year = {2006}, pages = {228--232}, month = nov, publisher = {ACTA Press}, isbn = {0-88986-642-2}, url = "http://personals.ac.upc.edu/josepr", location = {Dallas, Texas, USA} } @InProceedings{ herr.nava:2006:shc, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Sparse Hypermatrix {Cholesky}: Customization for High Performance}, booktitle = {Proceedings of The International MultiConference of Engineers and Computer Scientists 2006}, year = {2006}, pages = {821--827}, month = jun, isbn = {978-988-98671-3-3}, url = "http://personals.ac.upc.edu/josepr", location = {Hong Kong}, abstract = {Efficient execution of numerical algorithms requires adapting the code to the underlying execution platform. In this paper we show the process of fine tuning our sparse Hypermatrix Cholesky factorization in order to exploit efficiently two important machine resources: processor and memory. Using the techniques we presented in previous papers we tune our code on a different platform. Then, we extend our work in two directions: first, we experiment with a variation of the ordering algorithm, and second, we reduce the data submatrix storage to be able to use larger submatrix sizes.}, keywords = {Sparse Cholesky factorization, hypermatrix structure, small matrix library} } @InProceedings{ herr.nava:2006:una, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Using non-canonical array layouts in dense matrix operations}, booktitle = {Workshop on state-of-the-art in scientific computing (PARA'06),} # lncs # {4699}, year = {2006}, pages = {580--588}, doi = {10.1007/978-3-540-75755-9_70}, month = jun, publisher = springer, issn = {0302-9743}, isbn = {978-3-540-75754-2}, url = "http://personals.ac.upc.edu/josepr", location = {Ume\aa, Sweden}, abstract = {We present two implementations of dense matrix multiplication based on two different non-canonical array layouts: one based on a hypermatrix data structure (HM) where data submatrices are stored using a recursive layout; the other based on a simple block data layout with square blocks (SB) where blocks are arranged in column-major order. We show that the iterative code using SB outperforms a recursive code using HM and obtains competitive results on a variety of platforms.}, keywords = {Non-canonical array layouts, matrix multiplication, Square Block Format, Hypermatrix, Small Matrix Library (SML), inner kernels} } @InBook{ herr.nava:2007:aish, author = {Jos\'e R. Herrero and Juan J. Navarro}, editor = {S.I. Ao, A.M. Gracia, J. Lee and K. Yu}, title = {Advances in Sparse Hypermatrix {Cholesky} Factorization}, chapter = {2}, pages = {7--24}, publisher = {Newswood Limited}, year = {2007}, address = {Hong Kong}, isbn = {978-988-98671-1-9}, key = {}, booktitle = {Recent Advances in Engineering and Computer Science 2007}, kwds = {Sparse Hypermatrix Cholesky} } @Article{ herr.nava:2007:ash, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Analysis of a Sparse Hypermatrix {Cholesky} with Fixed-Sized Blocking}, journal = {Applicable Algebra in Engineering, Communications and Computing}, year = {2007}, volume = {18}, number = {3}, pages = {279--295}, month = may, issn = {0938-1279 (Print) 1432-0622 (Online)}, doi = {10.1007/s00200-007-0039-8}, note = {DOI: 10.1007/s00200-007-0039-8}, publisher = {Springer}, link = {http://www.springerlink.com/content/q7xv34355818x314/}, url = "http://personals.ac.upc.edu/josepr", abstract = {We present the way in which we have constructed an implementation of a sparse Cholesky factorization based on a hypermatrix data structure. This data structure is a storage scheme which produces a recursive 2D partitioning of a sparse matrix. It can be useful on some large sparse matrices. Subblocks are stored as dense matrices. Thus, efficient BLAS3 routines can be used. However, since we are dealing with sparse matrices some zeros may be stored in those dense blocks. The overhead introduced by the operations on zeros can become large and considerably degrade performance. We present the ways in which we deal with this overhead. Using matrices from different areas (Interior Point Methods of linear programming and Finite Element Methods), we evaluate our sequential in-core hypermatrix sparse Cholesky implementation. We compare its performance with several other codes and analyze the results. In spite of using a simple fixed-size partitioning of the matrix our code obtains competitive performance.}, keywords = {Sparse Cholesky - Hypermatrix structure - 2D partitioning - Windows in submatrices - Small Matrix Library} } @Article{ herr.nava:2007:ecr, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Exploiting computer resources for fast nearest neighbor classification}, journal = {Pattern Analysis \& Applications}, year = {2007}, volume = {10}, number = {4}, pages = {265--275}, month = oct, issn = {1433-7541 (Print) 1433-755X (Online)}, doi = {10.1007/s10044-007-0065-y}, publisher = {Springer}, link = {http://www.springerlink.com/content/n187u354832u06l5}, url = "http://personals.ac.upc.edu/josepr", abstract = {Modern computers provide excellent opportunities for performing fast computations. They are equipped with powerful microprocessors and large memories. However, programs are not necessarily able to exploit those computer resources effectively. In this paper, we present the way in which we have implemented a nearest neighbor classification. We show how performance can be improved by exploiting the ability of superscalar processors to issue multiple instructions per cycle and by using the memory hierarchy adequately. This is accomplished by the use of floating-point arithmetic which usually outperforms integer arithmetic, and block (tiled) algorithms which exploit the data locality of programs allowing for an efficient use of the data stored in the cache memory. Our results are validated with both an analytical model and empirical results. We show that regular codes could be performed faster than more complex irregular codes using standard data sets.} } @Article{ herr.nava:2007:shc, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Sparse Hypermatrix {Cholesky}: Customization for High Performance}, journal = {IAENG International Journal of Applied Mathematics}, year = {2007}, volume = {36}, number = {1}, pages = {6--12}, month = mar, issn = {1992-9986 (online version); 1992-9978 (print version)}, publisher = {Newswood Limited}, link = {http://www.iaeng.org/IJAM/issues_v36/issue_1/index.html}, url = "http://personals.ac.upc.edu/josepr", abstract = {Efficient execution of numerical algorithms requires adapting the code to the underlying execution platform. In this paper we show the process of fine tuning our sparse Hypermatrix Cholesky factorization in order to exploit efficiently two important machine resources: processor and memory. Using the techniques we presented in previous papers we tune our code on a different platform. Then, we extend our work in two directions: first, we experiment with a variation of the ordering algorithm, and second, we reduce the data submatrix storage to be able to use larger submatrix sizes.}, keywords = {Sparse Cholesky factorization, hypermatrix structure, small matrix library} } @Article{ herr.nava:2008:hos, author = {Jos\'e R. Herrero and Juan J. Navarro}, title = {Hypermatrix Oriented Supernode Amalgamation}, journal = {The Journal of Supercomputing}, year = {2008}, volume = {46}, number = {1}, pages = {84--104}, month = oct, issn = {0920-8542 (Print) 1573-0484 (Online)}, doi = {10.1007/s11227-008-0188-y}, publisher = {Springer}, link = {http://www.springerlink.com/}, url = "http://personals.ac.upc.edu/josepr", abstract = {In this paper, we introduce a supernode amalgamation algorithm which takes into account the characteristics of a hypermatrix data structure. The resulting frontal tree is then used to create a variable-sized partitioning of the hypermatrix. The sparse hypermatrix Cholesky factorization obtained runs slightly faster than the one which uses a fixed-sized partitioning. The algorithm also reduces data dependencies which limit exploitation of parallelism}, keywords = {Sparse Cholesky factorization - Hypermatrix - Supernode amalgamation - Variable-sized partitioning} } @PhDThesis{ herr:2006:phd, author = {Jos\'e R. Herrero}, title = {A Framework for Efficient Execution of Matrix Computations}, school = {Computer Architecture Department, Universitat Polit\`ecnica de Catalunya}, year = 2006, month = jul, url = "http://personals.ac.upc.edu/josepr", depositolegal = {B.52668-2006}, isbn = {84-690-2494-9} } @InProceedings{ herr:2007:nds, author = {Jos\'e R. Herrero}, title = {New Data Structures for Matrices and Specialized Inner Kernels: Low overhead for High Performance}, booktitle = {Int. Conf. on Parallel Processing and Applied Mathematics (PPAM'07)}, year = {2007}, series = "Lecture Notes in Computer Science", volume = {4967}, pages = {659--667}, month = sep, publisher = springer, annote = {Parallel Processing and Applied Mathematics} # lncs # {4967}, booktitle-s = "PPAM'07", fbooktitle = {Proceedings of the International Conference on Parallel Processing and Applied Mathematics}, issn = {0302-9743}, isbn = {978-3-540-68105-2}, doi = {10.1007/978-3-540-68111-3_69}, url = "http://personals.ac.upc.edu/josepr", location = {Gdansk, Poland}, abstract = {Dense linear algebra codes are often expressed and coded in terms of BLAS calls. This approach, however, achieves suboptimal performance due to the overheads associated to such calls. Taking as an example the dense Cholesky factorization of a symmetric positive definite matrix we show that the potential of non-canonical data structures for dense linear algebra can be better exploited with the use of specialized inner kernels. The use of non-canonical data structures together with specialized inner kernels has low overhead and can produce excellent performance.}, keywords = {Specialized inner kernels, new data structures, dense linear algebra, low overhead, high performance.} } @InProceedings{ nava.garc.ea:1996:dpm, author = {Juan J. Navarro and E. Garc\'{\i}a and Jos\'e R. Herrero}, title = {Data Prefetching and Multilevel Blocking for Linear Algebra Operations}, booktitle = {Proceedings of the 10th international conference on Supercomputing}, year = {1996}, pages = {109--116}, month = may, publisher = {ACM Press}, doi = {http://doi.acm.org/10.1145/237578.237592}, isbn = {0-89791-803-7}, url = "http://personals.ac.upc.edu/josepr", location = {Philadelphia, USA} }