@book{ herr.mart.ea:2000:cad,
  author    = {Herrero, J. R. and Martorell, X. and Torres, J.},
  title     = {Conceptes Avan{\c{c}}ats de Sistemes Operatius},
  publisher = {{Edicions UPC: Edicions Virtuals}},
  year      = {2000},
  address   = {Barcelona, Spain},
  edition   = {First},
  isbn      = {84-8301-439-4},
  url       = {http://personals.ac.upc.edu/josepr}
}
@inproceedings{ herr.nava:2003:abo,
  author     = {Jos\'e R. Herrero and Juan J. Navarro},
  title      = {Automatic benchmarking and optimization of codes: an
                experience with numerical kernels},
  booktitle  = {Int. Conf. on Software Engineering Research and Practice},
  fbooktitle = {Proceedings of the 2003 International Conference on
                Software Engineering Research and Practice},
  year       = {2003},
  month      = jun,
  pages      = {701--706},
  publisher  = {CSREA Press},
  isbn       = {1-932415-20-3},
  url        = {http://personals.ac.upc.edu/josepr},
  location   = {Las Vegas},
  abstract   = {New algorithms are constantly developed in search of better
                or faster results. Many variants of code are often tried
                while searching for the best solution. When the number of
                code variants or possible input parameters is very high,
                the process of benchmarking and analysis of results can
                become cumbersome and error prone. We present a tool for
                automatic benchmarking which manages a database of possible
                parameters and the results obtained for them. This tool can
                automatically choose the optimum code and create a target
                library. Using it we have generated a library specialized
                in the operation on very small matrices which is useful in
                multimedia codes. },
  keywords   = {Automatic benchmarking, tuning, performance, small
                matrices}
}
@inproceedings{ herr.nava:2003:bsv,
  author     = {Jos\'e R. Herrero and Juan J. Navarro},
  title      = {Building Software via Shared Knowledge},
  booktitle  = {Int. Conf. on Software Engineering Research and Practice},
  fbooktitle = {Proceedings of the 2003 International Conference on
                Software Engineering Research and Practice},
  year       = {2003},
  month      = jun,
  pages      = {861--867},
  publisher  = {CSREA Press},
  isbn       = {1-932415-20-3},
  url        = {http://personals.ac.upc.edu/josepr},
  location   = {Las Vegas}
}
@inproceedings{ herr.nava:2003:iph,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Improving Performance of Hypermatrix {Cholesky}
               Factorization},
  booktitle = {Euro-Par 2003,} # lncs # {2790},
  year      = {2003},
  month     = aug,
  pages     = {461--469},
  publisher = springer,
  url       = {http://personals.ac.upc.edu/josepr},
  location  = {Klagenfurt, Austria},
  abstract  = {This paper shows how a sparse hypermatrix Cholesky
               factorization can be improved. This is accomplished by
               means of efficient codes which operate on very small dense
               matrices. Different matrix sizes or target platforms may
               require different codes to obtain good performance. We
               write a set of codes for each matrix operation using
               different loop orders and unroll factors. Then, for each
               matrix size, we automatically compile each code fixing
               matrix leading dimensions and loop sizes, run the resulting
               executable and keep its Mflops. The best combination is
               then used to produce the object introduced in a library.
               Thus, a routine for each desired matrix size is available
               from the library. The large overhead incurred by the
               hypermatrix Cholesky factorization of sparse matrices can
               therefore be lessened by reducing the block size when those
               routines are used. Using the routines, e.g. matrix
               multiplication, in our small matrix library produced
               important speed-ups in our sparse Cholesky code.},
  keywords  = {Hypermatrix Cholesky, inner kernels, small matrices}
}
@inproceedings{ herr.nava:2003:oss,
  author        = {Jos\'e R. Herrero and David Benlliure},
  title         = {Operating System Support for Process Confinement},
  booktitle     = {Proceedings of the International Conference on Security
                   and Management},
  year          = {2003},
  month         = jun,
  pages         = {421--425},
  publisher     = {CSREA Press},
  isbn          = {1-932415-17-3},
  url           = {http://personals.ac.upc.edu/josepr},
  location      = {Las Vegas},
  abstract      = {Execution of untrusted software can compromise a whole
                   system. Tools for restricting access of software to system
                   resources are essential for security maintenance. Operating
                   systems should offer functionality for building tools which
                   could run in user mode with no special privileges while
                   providing full access control. Thus, they could be made
                   available to any user in the system. In this paper we show
                   ways of extending an operating system in order to provide
                   such functionality. We present the changes introduced in
                   the Linux kernel to offer the minimum functionality
                   necessary for building such a tool. Using the new
                   functionality we were able to code a program for
                   controlling execution of untrusted software through system
                   call interposition.},
  keywords      = {Process confinement, system call interposition, user
                   mode monitor.},
  internal-note = {key suffix says herr.nava but second author is Benlliure - verify key}
}
@inproceedings{ herr.nava:2004:osp,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Optimization of a Statically Partitioned Hypermatrix
               Sparse {Cholesky} Factorization},
  booktitle = {Workshop on state-of-the-art in scientific computing
               (PARA'04),} # lncs # {3732},
  year      = {2004},
  month     = jun,
  pages     = {798--807},
  publisher = springer,
  issn      = {0302-9743},
  isbn      = {978-3-540-29067-4},
  doi       = {10.1007/11558958_96},
  url       = {http://personals.ac.upc.edu/josepr},
  location  = {Lyngby, Denmark},
  abstract  = {The sparse Cholesky factorization of some large matrices
               can require a two dimensional partitioning of the matrix.
               The sparse hypermatrix storage scheme produces a recursive
               2D partitioning of a sparse matrix. The subblocks are
               stored as dense matrices so BLAS3 routines can be used.
               However, since we are dealing with sparse matrices some
               zeros may be stored in those dense blocks. The overhead
               introduced by the operations on zeros can become large and
               considerably degrade performance. In this paper we present
               an improvement to our sequential in-core implementation of
               a sparse Cholesky factorization based on a hypermatrix
               storage structure. We compare its performance with several
               codes and analyze the results. },
  keywords  = {Sparse Cholesky - Hypermatrix structure - 2D partitioning
               - Windows in submatrices - Small Matrix Library}
}
@inproceedings{ herr.nava:2004:ros,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Reducing Overhead in Sparse Hypermatrix {Cholesky}
               Factorization},
  booktitle = {IFIP TC5 Workshop on High Performance Computational
               Science and Engineering (HPCSE), World Computer Congress},
  year      = {2004},
  month     = aug,
  pages     = {143--154},
  publisher = springer,
  issn      = {1571-5736},
  isbn      = {978-0-387-24048-0},
  url       = {http://www.springer.com/east/home/new+%26+forthcoming+titles+%28default%29?SGWID=5-40356-22-36844470-0},
  url2      = {http://personals.ac.upc.edu/josepr},
  location  = {Toulouse, France},
  abstract  = {The sparse hypermatrix storage scheme produces a recursive
               2D partitioning of a sparse matrix. Data subblocks are
               stored as dense matrices. Since we are dealing with sparse
               matrices some zeros can be stored in those dense blocks.
               The overhead introduced by the operations on zeros can
               become really large and considerably degrade performance.
               In this paper, we present several techniques for reducing
               the operations on zeros in a sparse hypermatrix Cholesky
               factorization. By associating a bit to each column within a
               data submatrix we create a bit vector. We can avoid
               computations when the bitwise AND of their bit vectors is
               null. By keeping information about the actual space within
               a data submatrix which stores non-zeros (dense window) we
               can reduce both storage and computation. },
  keywords  = {Sparse Hypermatrix Cholesky, bit vector, dense window}
}
@inproceedings{ herr.nava:2005:ala,
  author     = {Jos\'e R. Herrero and Juan J. Navarro},
  title      = {Adapting Linear Algebra Codes to the Memory Hierarchy
                Using a Hypermatrix Scheme},
  booktitle  = {Int. Conf. on Parallel Processing and Applied Mathematics.
                LNCS 3911},
  fbooktitle = {Proceedings of the International Conference on Parallel
                Processing and Applied Mathematics},
  year       = {2005},
  month      = sep,
  pages      = {1058--1065},
  issn       = {0302-9743},
  isbn       = {978-3-540-34141-3},
  doi        = {10.1007/11752578_128},
  url        = {http://personals.ac.upc.edu/josepr},
  location   = {Poznan, Poland},
  abstract   = {We present the way in which we adapt data and computations
                to the underlying memory hierarchy by means of a
                hierarchical data structure known as hypermatrix. The
                application of orthogonal block forms produced the best
                performance for the platforms used. },
  keywords   = {Non-canonical storage, small matrix library, inner
                kernels}
}
@inproceedings{ herr.nava:2005:ein,
  author     = {Jos\'e R. Herrero and Juan J. Navarro},
  title      = {Efficient Implementation of Nearest Neighbor
                Classification},
  booktitle  = {Int. Conf. on Computer Recognition Systems (CORES).
                Advances in Soft Computing, Vol XVIII},
  fbooktitle = {International Conference on Computer Recognition Systems
                (CORES). Advances in Soft Computing, Vol XVIII},
  year       = {2005},
  month      = may,
  pages      = {177--186},
  issn       = {1615-3871},
  issn2      = {1434-9922},
  isbn       = {978-3-540-25054-8},
  url        = {http://personals.ac.upc.edu/josepr},
  location   = {Poland}
}
@inproceedings{ herr.nava:2005:ias,
  author     = {Jos\'e R. Herrero and Juan J. Navarro},
  title      = {Intra-Block Amalgamation in Sparse Hypermatrix {Cholesky}
                Factorization},
  booktitle  = {Int. Conf. on Computational Science and Engineering},
  fbooktitle = {Proceedings of the International Conference on
                Computational Science and Engineering},
  year       = {2005},
  month      = jun,
  pages      = {15--22},
  isbn       = {975-561-266-1},
  url        = {http://personals.ac.upc.edu/josepr},
  location   = {Istanbul, Turkey},
  abstract   = {In this paper we present an improvement to our sequential
                in-core implementation of a sparse Cholesky factorization
                based on a hypermatrix storage structure. We allow the
                inclusion of additional zeros in data submatrices to create
                larger blocks and in this way use more efficient routines
                for matrix multiplication. Since matrix multiplication
                takes about 90\% of the total factorization time this is an
                important point to optimize. },
  keywords   = {Sparse Cholesky factorization, hypermatrix structure,
                intra-block amalgamation, small matrix library}
}
@inproceedings{ herr.nava:2005:sli,
  author     = {Jos\'e R. Herrero and Juan J. Navarro},
  title      = {A Study on Load Imbalance in Parallel Hypermatrix
                Multiplication using {OpenMP}},
  booktitle  = {Int. Conf. on Parallel Processing and Applied Mathematics.
                LNCS 3911},
  fbooktitle = {Proceedings of the International Conference on Parallel
                Processing and Applied Mathematics},
  year       = {2005},
  month      = sep,
  pages      = {124--131},
  issn       = {0302-9743},
  isbn       = {978-3-540-34141-3},
  doi        = {10.1007/11752578_16},
  url        = {http://personals.ac.upc.edu/josepr},
  location   = {Poznan, Poland}
}
@misc{ herr.nava:2006:afa:mis,
  author = {Jos\'e R. Herrero and Juan J. Navarro},
  title  = {{ACME}: A framework for Accurate Measurements},
  year   = {2006},
  note   = {http://personals.ac.upc.edu/josepr/Soft/ACME},
  url    = {http://personals.ac.upc.edu/josepr/Soft/ACME}
}
@inproceedings{ herr.nava:2006:cke,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Compiler-Optimized Kernels: An Efficient Alternative to
               Hand-Coded Inner Kernels},
  booktitle = {Proceedings of the International Conference on
               Computational Science and its Applications (ICCSA). LNCS
               3984},
  year      = {2006},
  month     = may,
  pages     = {762--771},
  issn      = {0302-9743},
  isbn      = {978-3-540-34079-9},
  doi       = {10.1007/11751649_84},
  url       = {http://personals.ac.upc.edu/josepr},
  location  = {Glasgow, UK},
  abstract  = {The use of highly optimized inner kernels is of paramount
               importance for obtaining efficient numerical algorithms.
               Often, such kernels are created by hand. In this paper,
               however, we present an alternative way to produce efficient
               matrix multiplication kernels based on a set of simple
               codes which can be parameterized at compilation time. Using
               the resulting kernels we have been able to produce high
               performance sparse and dense linear algebra codes on a
               variety of platforms.},
  keywords  = {Inner kernels, compiler optimization, matrix
               multiplication}
}
@inproceedings{ herr.nava:2006:fam,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {A Framework for Accurate Measurements with Low Resolution
               Clocks},
  booktitle = {The 10th IASTED International Conference on Software
               Engineering and Applications (SEA 2006)},
  year      = {2006},
  month     = nov,
  pages     = {228--232},
  publisher = {ACTA Press},
  isbn      = {0-88986-642-2},
  url       = {http://personals.ac.upc.edu/josepr},
  location  = {Dallas, Texas, USA}
}
@inproceedings{ herr.nava:2006:shc,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Sparse Hypermatrix {Cholesky}: Customization for High
               Performance},
  booktitle = {Proceedings of The International MultiConference of
               Engineers and Computer Scientists 2006},
  year      = {2006},
  month     = jun,
  pages     = {821--827},
  isbn      = {978-988-98671-3-3},
  url       = {http://personals.ac.upc.edu/josepr},
  location  = {Hong Kong},
  abstract  = {Efficient execution of numerical algorithms requires
               adapting the code to the underlying execution platform. In
               this paper we show the process of fine tuning our sparse
               Hypermatrix Cholesky factorization in order to exploit
               efficiently two important machine resources: processor and
               memory. Using the techniques we presented in previous
               papers we tune our code on a different platform. Then, we
               extend our work in two directions: first, we experiment
               with a variation of the ordering algorithm, and second, we
               reduce the data submatrix storage to be able to use larger
               submatrix sizes.},
  keywords  = {Sparse Cholesky factorization, hypermatrix structure,
               small matrix library}
}
@inproceedings{ herr.nava:2006:una,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Using non-canonical array layouts in dense matrix
               operations},
  booktitle = {Workshop on state-of-the-art in scientific computing
               (PARA'06),} # lncs # {4699},
  year      = {2006},
  month     = jun,
  pages     = {580--588},
  publisher = springer,
  issn      = {0302-9743},
  isbn      = {978-3-540-75754-2},
  doi       = {10.1007/978-3-540-75755-9_70},
  url       = {http://personals.ac.upc.edu/josepr},
  location  = {Ume{\aa}, Sweden},
  abstract  = {We present two implementations of dense matrix
               multiplication based on two different non-canonical
               array layouts: one based on a hypermatrix data
               structure (HM) where data submatrices are stored using
               a recursive layout; the other based on a simple block
               data layout with square blocks (SB) where blocks are
               arranged in column-major order. We show that the
               iterative code using SB outperforms a recursive code
               using HM and obtains competitive results on a variety
               of platforms.},
  keywords  = {Non-canonical array layouts, matrix multiplication,
               Square Block Format, Hypermatrix, Small Matrix Library (SML),
               inner kernels}
}
@incollection{ herr.nava:2007:aish,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  editor    = {Ao, S. I. and Gracia, A. M. and Lee, J. and Yu, K.},
  title     = {Advances in Sparse Hypermatrix {Cholesky} Factorization},
  booktitle = {Recent Advances in Engineering and Computer Science 2007},
  chapter   = {2},
  pages     = {7--24},
  publisher = {Newswood Limited},
  year      = {2007},
  address   = {Hong Kong},
  isbn      = {978-988-98671-1-9},
  keywords  = {Sparse Hypermatrix Cholesky}
}
@article{ herr.nava:2007:ash,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Analysis of a Sparse Hypermatrix {Cholesky} with
               Fixed-Sized Blocking},
  journal   = {Applicable Algebra in Engineering, Communications and
               Computing},
  year      = {2007},
  volume    = {18},
  number    = {3},
  month     = may,
  pages     = {279--295},
  publisher = {Springer},
  issn      = {0938-1279 (Print) 1432-0622 (Online)},
  doi       = {10.1007/s00200-007-0039-8},
  note      = {DOI: 10.1007/s00200-007-0039-8},
  link      = {http://www.springerlink.com/content/q7xv34355818x314/},
  url       = {http://personals.ac.upc.edu/josepr},
  abstract  = {We present the way in which we have constructed an
               implementation of a sparse Cholesky factorization based on
               a hypermatrix data structure. This data structure is a
               storage scheme which produces a recursive 2D partitioning
               of a sparse matrix. It can be useful on some large sparse
               matrices. Subblocks are stored as dense matrices. Thus,
               efficient BLAS3 routines can be used. However, since we are
               dealing with sparse matrices some zeros may be stored in
               those dense blocks. The overhead introduced by the
               operations on zeros can become large and considerably
               degrade performance. We present the ways in which we deal
               with this overhead. Using matrices from different areas
               (Interior Point Methods of linear programming and Finite
               Element Methods), we evaluate our sequential in-core
               hypermatrix sparse Cholesky implementation. We compare its
               performance with several other codes and analyze the
               results. In spite of using a simple fixed-size partitioning
               of the matrix our code obtains competitive performance.},
  keywords  = {Sparse Cholesky - Hypermatrix structure - 2D partitioning
               - Windows in submatrices - Small Matrix Library}
}
@article{ herr.nava:2007:ecr,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Exploiting computer resources for fast nearest neighbor
               classification},
  journal   = {Pattern Analysis \& Applications},
  year      = {2007},
  volume    = {10},
  number    = {4},
  month     = oct,
  pages     = {265--275},
  publisher = {Springer},
  issn      = {1433-7541 (Print) 1433-755X (Online)},
  doi       = {10.1007/s10044-007-0065-y},
  link      = {http://www.springerlink.com/content/n187u354832u06l5},
  url       = {http://personals.ac.upc.edu/josepr},
  abstract  = {Modern computers provide excellent opportunities for
               performing fast computations. They are equipped with
               powerful microprocessors and large memories. However,
               programs are not necessarily able to exploit those
               computer resources effectively. In this paper, we
               present the way in which we have implemented a nearest
               neighbor classification. We show how performance can
               be improved by exploiting the ability of superscalar
               processors to issue multiple instructions per cycle
               and by using the memory hierarchy adequately. This is
               accomplished by the use of floating-point arithmetic
               which usually outperforms integer arithmetic, and
               block (tiled) algorithms which exploit the data
               locality of programs allowing for an efficient use of
               the data stored in the cache memory. Our results are
               validated with both an analytical model and empirical
               results. We show that regular codes could be performed
               faster than more complex irregular codes using
               standard data sets.}
}
@article{ herr.nava:2007:shc,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Sparse Hypermatrix {Cholesky}: Customization for High
               Performance},
  journal   = {IAENG International Journal of Applied Mathematics},
  year      = {2007},
  volume    = {36},
  number    = {1},
  month     = mar,
  pages     = {6--12},
  publisher = {Newswood Limited},
  issn      = {1992-9986 (online version); 1992-9978 (print version)},
  link      = {http://www.iaeng.org/IJAM/issues_v36/issue_1/index.html},
  url       = {http://personals.ac.upc.edu/josepr},
  abstract  = {Efficient execution of numerical algorithms requires
               adapting the code to the underlying execution platform. In
               this paper we show the process of fine tuning our sparse
               Hypermatrix Cholesky factorization in order to exploit
               efficiently two important machine resources: processor and
               memory. Using the techniques we presented in previous
               papers we tune our code on a different platform. Then, we
               extend our work in two directions: first, we experiment
               with a variation of the ordering algorithm, and second, we
               reduce the data submatrix storage to be able to use larger
               submatrix sizes.},
  keywords  = {Sparse Cholesky factorization, hypermatrix structure,
               small matrix library}
}
@article{ herr.nava:2008:hos,
  author    = {Jos\'e R. Herrero and Juan J. Navarro},
  title     = {Hypermatrix Oriented Supernode Amalgamation},
  journal   = {The Journal of Supercomputing},
  year      = {2008},
  volume    = {46},
  number    = {1},
  month     = oct,
  pages     = {84--104},
  publisher = {Springer},
  issn      = {0920-8542 (Print) 1573-0484 (Online)},
  doi       = {10.1007/s11227-008-0188-y},
  link      = {http://www.springerlink.com/},
  url       = {http://personals.ac.upc.edu/josepr},
  abstract  = {In this paper, we introduce a supernode amalgamation
               algorithm which takes into account the characteristics of a
               hypermatrix data structure. The resulting frontal tree is
               then used to create a variable-sized partitioning of the
               hypermatrix. The sparse hypermatrix Cholesky factorization
               obtained runs slightly faster than the one which uses a
               fixed-sized partitioning. The algorithm also reduces data
               dependencies which limit exploitation of parallelism},
  keywords  = {Sparse Cholesky factorization - Hypermatrix - Supernode
               amalgamation - Variable-sized partitioning}
}
@phdthesis{ herr:2006:phd,
  author        = {Jos\'e R. Herrero},
  title         = {A Framework for Efficient Execution of Matrix
                   Computations},
  school        = {Computer Architecture Department, Universitat
                   Polit\`ecnica de Catalunya},
  year          = {2006},
  month         = jul,
  url           = {http://personals.ac.upc.edu/josepr},
  depositolegal = {B.52668-2006},
  isbn          = {84-690-2494-9}
}
@inproceedings{ herr:2007:nds,
  author      = {Jos\'e R. Herrero},
  title       = {New Data Structures for Matrices and Specialized Inner
                 Kernels: Low overhead for High Performance},
  booktitle   = {Int. Conf. on Parallel Processing and Applied Mathematics
                 (PPAM'07)},
  booktitle-s = {PPAM'07},
  fbooktitle  = {Proceedings of the International Conference on Parallel
                 Processing and Applied Mathematics},
  series      = {Lecture Notes in Computer Science},
  volume      = {4967},
  year        = {2007},
  month       = sep,
  pages       = {659--667},
  publisher   = springer,
  annote      = {Parallel Processing and Applied Mathematics} # lncs
                # {4967},
  issn        = {0302-9743},
  isbn        = {978-3-540-68105-2},
  doi         = {10.1007/978-3-540-68111-3_69},
  url         = {http://personals.ac.upc.edu/josepr},
  location    = {Gdansk, Poland},
  abstract    = {Dense linear algebra codes are often expressed and coded
                 in terms of BLAS calls. This approach, however, achieves
                 suboptimal performance due to the overheads associated to
                 such calls.
                 Taking as an example the dense Cholesky factorization of a
                 symmetric positive definite matrix we show that the
                 potential of non-canonical data structures for dense linear
                 algebra can be better exploited with the use of specialized
                 inner kernels. The use of non-canonical data structures
                 together with specialized inner kernels has low overhead
                 and can produce excellent performance.},
  keywords    = {Specialized inner kernels, new data structures, dense
                 linear algebra, low overhead, high performance.}
}
@inproceedings{ nava.garc.ea:1996:dpm,
  author    = {Juan J. Navarro and E. Garc{\'\i}a and Jos\'e R. Herrero},
  title     = {Data Prefetching and Multilevel Blocking for Linear
               Algebra Operations},
  booktitle = {Proceedings of the 10th international conference on
               Supercomputing},
  year      = {1996},
  month     = may,
  pages     = {109--116},
  publisher = {ACM Press},
  doi       = {10.1145/237578.237592},
  isbn      = {0-89791-803-7},
  url       = {http://personals.ac.upc.edu/josepr},
  location  = {Philadelphia, USA}
}
@article{GrauSánchez2012,
  author        = {Grau-S{\'a}nchez, Miquel and Noguera, Miquel and Grau, {\`A}ngela and Herrero, Jos{\'e} R.},
  title         = {On New Computational Local Orders of Convergence},
  journal       = {Applied Mathematics Letters},
  year          = {2012},
  issn          = {0893-9659},
  doi           = {10.1016/j.aml.2012.04.012},
  url           = {http://www.sciencedirect.com/science/article/pii/S0893965912002285},
  keywords      = {Order of convergence, Nonlinear equations, Iterative methods},
  internal-note = {in-press export: volume/number/pages pending - fill in when assigned; non-ASCII citation key kept to avoid breaking existing cites}
}
@inproceedings{DBLP:conf/ppam/GustavsonWH11,
  author    = {Fred G. Gustavson and
               Jerzy Wasniewski and
               Jos{\'e} R. Herrero},
  title     = {New Level-3 BLAS Kernels for Cholesky Factorization},
  booktitle = {PPAM (1)},
  year      = {2011},
  pages     = {60--69},
  doi       = {10.1007/978-3-642-31464-3_7},
  ee        = {http://dx.doi.org/10.1007/978-3-642-31464-3_7},
  crossref  = {DBLP:conf/ppam/2011-1},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}
@proceedings{DBLP:conf/ppam/2011-1,
  editor    = {Roman Wyrzykowski and
               Jack Dongarra and
               Konrad Karczewski and
               Jerzy Wasniewski},
  title     = {Parallel Processing and Applied Mathematics - 9th International
               Conference, PPAM 2011, Toru{\'n}, Poland, September 11-14, 2011.
               Revised Selected Papers, Part I},
  booktitle = {PPAM (1)},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  volume    = {7203},
  year      = {2012},
  isbn      = {978-3-642-31463-6},
  doi       = {10.1007/978-3-642-31464-3},
  ee        = {http://dx.doi.org/10.1007/978-3-642-31464-3},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}
@article{DBLP:journals/concurrency/HerreroQS11,
  author    = {Jos{\'e} R. Herrero and
               Enrique S. Quintana-Ort{\'\i} and
               Robert Strzodka},
  title     = {Special Issue: GPU computing},
  journal   = {Concurrency and Computation: Practice and Experience},
  volume    = {23},
  number    = {7},
  year      = {2011},
  pages     = {667--668},
  doi       = {10.1002/cpe.1691},
  ee        = {http://dx.doi.org/10.1002/cpe.1691},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}