Subversion Repositories OpenSees

Compare Revisions

Ignore whitespace Rev 4518 → Rev 4519

/trunk/OTHER/SuperLU_DIST_2.5/SRC/pddistribute.c
New file
0,0 → 1,1058
 
 
/*! @file
* \brief Re-distribute A on the 2D process mesh.
* <pre>
* -- Distributed SuperLU routine (version 2.3) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 15, 2008
* </pre>
*/
 
#include "superlu_ddefs.h"
 
 
/*! \brief
*
* <pre>
* Purpose
* =======
* Re-distribute A on the 2D process mesh.
*
* Arguments
* =========
*
* A (input) SuperMatrix*
* The distributed input matrix A of dimension (A->nrow, A->ncol).
* A may be overwritten by diag(R)*A*diag(C)*Pc^T.
* The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
*
* ScalePermstruct (input) ScalePermstruct_t*
* The data structure to store the scaling and permutation vectors
* describing the transformations performed to the original matrix A.
*
* Glu_freeable (input) *Glu_freeable_t
* The global structure describing the graph of L and U.
*
* grid (input) gridinfo_t*
* The 2D process mesh.
*
* colptr (output) int*
*
* rowind (output) int*
*
* a (output) double*
*
* Return value
* ============
* </pre>
*/
int_t
dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
gridinfo_t *grid, int_t *colptr[], int_t *rowind[],
double *a[])
{
NRformat_loc *Astore;
int_t *perm_r; /* row permutation vector */
int_t *perm_c; /* column permutation vector */
int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize;
int_t nnz_loc; /* number of local nonzeros */
int_t nnz_remote; /* number of remote nonzeros to be sent */
int_t SendCnt; /* number of remote nonzeros to be sent */
int_t RecvCnt; /* number of remote nonzeros to be sent */
int_t *nnzToSend, *nnzToRecv, maxnnzToRecv;
int_t *ia, *ja, **ia_send, *index, *itemp;
int_t *ptr_to_send;
double *aij, **aij_send, *nzval, *dtemp;
double *nzval_a;
int iam, it, p, procs;
MPI_Request *send_req;
MPI_Status status;
 
/* ------------------------------------------------------------
INITIALIZATION.
------------------------------------------------------------*/
iam = grid->iam;
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter dReDistribute_A()");
#endif
perm_r = ScalePermstruct->perm_r;
perm_c = ScalePermstruct->perm_c;
procs = grid->nprow * grid->npcol;
Astore = (NRformat_loc *) A->Store;
n = A->ncol;
m_loc = Astore->m_loc;
fst_row = Astore->fst_row;
nnzToRecv = intCalloc_dist(2*procs);
nnzToSend = nnzToRecv + procs;
 
 
/* ------------------------------------------------------------
COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
THEN ALLOCATE SPACE.
THIS ACCOUNTS FOR THE FIRST PASS OF A.
------------------------------------------------------------*/
for (i = 0; i < m_loc; ++i) {
for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */
jcol = Astore->colind[j];
gbi = BlockNum( irow );
gbj = BlockNum( jcol );
p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
++nnzToSend[p];
}
}
 
/* All-to-all communication */
MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
grid->comm);
 
maxnnzToRecv = 0;
nnz_loc = SendCnt = RecvCnt = 0;
 
for (p = 0; p < procs; ++p) {
if ( p != iam ) {
SendCnt += nnzToSend[p];
RecvCnt += nnzToRecv[p];
maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
} else {
nnz_loc += nnzToRecv[p];
/*assert(nnzToSend[p] == nnzToRecv[p]);*/
}
}
k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
 
/* Allocate space for storing the triplets after redistribution. */
if ( k ) { /* count can be zero. */
if ( !(ia = intMalloc_dist(2*k)) )
ABORT("Malloc fails for ia[].");
if ( !(aij = doubleMalloc_dist(k)) )
ABORT("Malloc fails for aij[].");
}
ja = ia + k;
 
/* Allocate temporary storage for sending/receiving the A triplets. */
if ( procs > 1 ) {
if ( !(send_req = (MPI_Request *)
SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) )
ABORT("Malloc fails for send_req[].");
if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) )
ABORT("Malloc fails for ia_send[].");
if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) )
ABORT("Malloc fails for aij_send[].");
if ( SendCnt ) { /* count can be zero */
if ( !(index = intMalloc_dist(2*SendCnt)) )
ABORT("Malloc fails for index[].");
if ( !(nzval = doubleMalloc_dist(SendCnt)) )
ABORT("Malloc fails for nzval[].");
}
if ( !(ptr_to_send = intCalloc_dist(procs)) )
ABORT("Malloc fails for ptr_to_send[].");
if ( maxnnzToRecv ) { /* count can be zero */
if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) )
ABORT("Malloc fails for itemp[].");
if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) )
ABORT("Malloc fails for dtemp[].");
}
 
for (i = 0, j = 0, p = 0; p < procs; ++p) {
if ( p != iam ) {
ia_send[p] = &index[i];
i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
aij_send[p] = &nzval[j];
j += nnzToSend[p];
}
}
} /* if procs > 1 */
if ( !(*colptr = intCalloc_dist(n+1)) )
ABORT("Malloc fails for *colptr[].");
 
/* ------------------------------------------------------------
LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
THIS ACCOUNTS FOR THE SECOND PASS OF A.
------------------------------------------------------------*/
nnz_loc = 0; /* Reset the local nonzero count. */
nzval_a = Astore->nzval;
for (i = 0; i < m_loc; ++i) {
for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */
jcol = Astore->colind[j];
gbi = BlockNum( irow );
gbj = BlockNum( jcol );
p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
 
if ( p != iam ) { /* remote */
k = ptr_to_send[p];
ia_send[p][k] = irow;
ia_send[p][k + nnzToSend[p]] = jcol;
aij_send[p][k] = nzval_a[j];
++ptr_to_send[p];
} else { /* local */
ia[nnz_loc] = irow;
ja[nnz_loc] = jcol;
aij[nnz_loc] = nzval_a[j];
++nnz_loc;
++(*colptr)[jcol]; /* Count nonzeros in each column */
}
}
}
 
/* ------------------------------------------------------------
PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
NOTE: Can possibly use MPI_Alltoallv.
------------------------------------------------------------*/
for (p = 0; p < procs; ++p) {
if ( p != iam ) {
it = 2*nnzToSend[p];
MPI_Isend( ia_send[p], it, mpi_int_t,
p, iam, grid->comm, &send_req[p] );
it = nnzToSend[p];
MPI_Isend( aij_send[p], it, MPI_DOUBLE,
p, iam+procs, grid->comm, &send_req[procs+p] );
}
}
 
for (p = 0; p < procs; ++p) {
if ( p != iam ) {
it = 2*nnzToRecv[p];
MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
it = nnzToRecv[p];
MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
grid->comm, &status );
for (i = 0; i < nnzToRecv[p]; ++i) {
ia[nnz_loc] = itemp[i];
jcol = itemp[i + nnzToRecv[p]];
/*assert(jcol<n);*/
ja[nnz_loc] = jcol;
aij[nnz_loc] = dtemp[i];
++nnz_loc;
++(*colptr)[jcol]; /* Count nonzeros in each column */
}
}
}
 
for (p = 0; p < procs; ++p) {
if ( p != iam ) {
MPI_Wait( &send_req[p], &status);
MPI_Wait( &send_req[procs+p], &status);
}
}
 
/* ------------------------------------------------------------
DEALLOCATE TEMPORARY STORAGE
------------------------------------------------------------*/
 
SUPERLU_FREE(nnzToRecv);
 
if ( procs > 1 ) {
SUPERLU_FREE(send_req);
SUPERLU_FREE(ia_send);
SUPERLU_FREE(aij_send);
if ( SendCnt ) {
SUPERLU_FREE(index);
SUPERLU_FREE(nzval);
}
SUPERLU_FREE(ptr_to_send);
if ( maxnnzToRecv ) {
SUPERLU_FREE(itemp);
SUPERLU_FREE(dtemp);
}
}
 
/* ------------------------------------------------------------
CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT.
------------------------------------------------------------*/
if ( nnz_loc ) { /* nnz_loc can be zero */
if ( !(*rowind = intMalloc_dist(nnz_loc)) )
ABORT("Malloc fails for *rowind[].");
if ( !(*a = doubleMalloc_dist(nnz_loc)) )
ABORT("Malloc fails for *a[].");
}
 
/* Initialize the array of column pointers */
k = 0;
jsize = (*colptr)[0];
(*colptr)[0] = 0;
for (j = 1; j < n; ++j) {
k += jsize;
jsize = (*colptr)[j];
(*colptr)[j] = k;
}
/* Copy the triplets into the column oriented storage */
for (i = 0; i < nnz_loc; ++i) {
j = ja[i];
k = (*colptr)[j];
(*rowind)[k] = ia[i];
(*a)[k] = aij[i];
++(*colptr)[j];
}
 
/* Reset the column pointers to the beginning of each column */
for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1];
(*colptr)[0] = 0;
 
if ( nnz_loc ) {
SUPERLU_FREE(ia);
SUPERLU_FREE(aij);
}
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit dReDistribute_A()");
#endif
} /* dReDistribute_A */
 
float
pddistribute(fact_t fact, int_t n, SuperMatrix *A,
ScalePermstruct_t *ScalePermstruct,
Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct,
gridinfo_t *grid)
/*
* -- Distributed SuperLU routine (version 2.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* March 15, 2003
*
*
* Purpose
* =======
* Distribute the matrix onto the 2D process mesh.
*
* Arguments
* =========
*
* fact (input) fact_t
* Specifies whether or not the L and U structures will be re-used.
* = SamePattern_SameRowPerm: L and U structures are input, and
* unchanged on exit.
* = DOFACT or SamePattern: L and U structures are computed and output.
*
* n (input) int
* Dimension of the matrix.
*
* A (input) SuperMatrix*
* The distributed input matrix A of dimension (A->nrow, A->ncol).
* A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be:
* Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
*
* ScalePermstruct (input) ScalePermstruct_t*
* The data structure to store the scaling and permutation vectors
* describing the transformations performed to the original matrix A.
*
* Glu_freeable (input) *Glu_freeable_t
* The global structure describing the graph of L and U.
*
* LUstruct (input) LUstruct_t*
* Data structures for L and U factors.
*
* grid (input) gridinfo_t*
* The 2D process mesh.
*
* Return value
* ============
* > 0, working storage required (in bytes).
*
*/
{
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
LocalLU_t *Llu = LUstruct->Llu;
int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k,
len, len1, nsupc;
int_t ljb; /* local block column number */
int_t nrbl; /* number of L blocks in current block column */
int_t nrbu; /* number of U blocks in current block column */
int_t gb; /* global block number; 0 < gb <= nsuper */
int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
int iam, jbrow, kcol, mycol, myrow, pc, pr;
int_t mybufmax[NBUFFERS];
NRformat_loc *Astore;
double *a;
int_t *asub, *xa;
int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */
int_t *supno = Glu_persist->supno;
int_t *lsub, *xlsub, *usub, *xusub;
int_t nsupers;
int_t next_lind; /* next available position in index[*] */
int_t next_lval; /* next available position in nzval[*] */
int_t *index; /* indices consist of headers and row subscripts */
double *lusup, *uval; /* nonzero values in L and U */
double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
 
/*-- Counts to be used in factorization. --*/
int_t *ToRecv, *ToSendD, **ToSendR;
 
/*-- Counts to be used in lower triangular solve. --*/
int_t *fmod; /* Modification count for L-solve. */
int_t **fsendx_plist; /* Column process list to send down Xk. */
int_t nfrecvx = 0; /* Number of Xk I will receive. */
int_t nfsendx = 0; /* Number of Xk I will send */
int_t kseen;
 
/*-- Counts to be used in upper triangular solve. --*/
int_t *bmod; /* Modification count for U-solve. */
int_t **bsendx_plist; /* Column process list to send down Xk. */
int_t nbrecvx = 0; /* Number of Xk I will receive. */
int_t nbsendx = 0; /* Number of Xk I will send */
int_t *ilsum; /* starting position of each supernode in
the full array (local) */
 
/*-- Auxiliary arrays; freed on return --*/
int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */
int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */
int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */
int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */
int_t *Ucbs; /* number of column blocks in a block row */
int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */
int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */
int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */
int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */
double *dense, *dense_col; /* SPA */
double zero = 0.0;
int_t ldaspa; /* LDA of SPA */
int_t iword, dword;
float mem_use = 0.0;
 
#if ( PRNTlevel>=1 )
int_t nLblocks = 0, nUblocks = 0;
#endif
#if ( PROFlevel>=1 )
double t, t_u, t_l;
int_t u_blks;
#endif
 
/* Initialization. */
iam = grid->iam;
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
nsupers = supno[n-1] + 1;
Astore = (NRformat_loc *) A->Store;
 
#if ( PRNTlevel>=1 )
iword = sizeof(int_t);
dword = sizeof(double);
#endif
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter pddistribute()");
#endif
#if ( PROFlevel>=1 )
t = SuperLU_timer_();
#endif
 
dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno,
grid, &xa, &asub, &a);
 
#if ( PROFlevel>=1 )
t = SuperLU_timer_() - t;
if ( !iam ) printf("--------\n"
".. Phase 1 - ReDistribute_A time: %.2f\t\n", t);
#endif
 
if ( fact == SamePattern_SameRowPerm ) {
 
#if ( PROFlevel>=1 )
t_l = t_u = 0; u_blks = 0;
#endif
/* We can propagate the new values of A into the existing
L and U data structures. */
ilsum = Llu->ilsum;
ldaspa = Llu->ldalsum;
if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) )
ABORT("Calloc fails for SPA dense[].");
nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
if ( !(Urb_length = intCalloc_dist(nrbu)) )
ABORT("Calloc fails for Urb_length[].");
if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
ABORT("Malloc fails for Urb_indptr[].");
Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
Unzval_br_ptr = Llu->Unzval_br_ptr;
#if ( PRNTlevel>=1 )
mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
#endif
#if ( PROFlevel>=1 )
t = SuperLU_timer_();
#endif
 
/* Initialize Uval to zero. */
for (lb = 0; lb < nrbu; ++lb) {
Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
index = Ufstnz_br_ptr[lb];
if ( index ) {
uval = Unzval_br_ptr[lb];
len = index[1];
for (i = 0; i < len; ++i) uval[i] = zero;
} /* if index != NULL */
} /* for lb ... */
 
for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
pc = PCOL( jb, grid );
if ( mycol == pc ) { /* Block column jb in my process column */
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
 
/* Scatter A into SPA (for L), or into U directly. */
for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
for (i = xa[j]; i < xa[j+1]; ++i) {
irow = asub[i];
gb = BlockNum( irow );
if ( myrow == PROW( gb, grid ) ) {
lb = LBi( gb, grid );
if ( gb < jb ) { /* in U */
index = Ufstnz_br_ptr[lb];
uval = Unzval_br_ptr[lb];
while ( (k = index[Urb_indptr[lb]]) < jb ) {
/* Skip nonzero values in this block */
Urb_length[lb] += index[Urb_indptr[lb]+1];
/* Move pointer to the next block */
Urb_indptr[lb] += UB_DESCRIPTOR
+ SuperSize( k );
}
/*assert(k == jb);*/
/* start fstnz */
istart = Urb_indptr[lb] + UB_DESCRIPTOR;
len = Urb_length[lb];
fsupc1 = FstBlockC( gb+1 );
k = j - fsupc;
/* Sum the lengths of the leading columns */
for (jj = 0; jj < k; ++jj)
len += fsupc1 - index[istart++];
/*assert(irow>=index[istart]);*/
uval[len + irow - index[istart]] = a[i];
} else { /* in L; put in SPA first */
irow = ilsum[lb] + irow - FstBlockC( gb );
dense_col[irow] = a[i];
}
}
} /* for i ... */
dense_col += ldaspa;
} /* for j ... */
 
#if ( PROFlevel>=1 )
t_u += SuperLU_timer_() - t;
t = SuperLU_timer_();
#endif
 
/* Gather the values of A from SPA into Lnzval[]. */
ljb = LBj( jb, grid ); /* Local block number */
index = Lrowind_bc_ptr[ljb];
if ( index ) {
nrbl = index[0]; /* Number of row blocks. */
len = index[1]; /* LDA of lusup[]. */
lusup = Lnzval_bc_ptr[ljb];
next_lind = BC_HEADER;
next_lval = 0;
for (jj = 0; jj < nrbl; ++jj) {
gb = index[next_lind++];
len1 = index[next_lind++]; /* Rows in the block. */
lb = LBi( gb, grid );
for (bnnz = 0; bnnz < len1; ++bnnz) {
irow = index[next_lind++]; /* Global index. */
irow = ilsum[lb] + irow - FstBlockC( gb );
k = next_lval++;
for (j = 0, dense_col = dense; j < nsupc; ++j) {
lusup[k] = dense_col[irow];
dense_col[irow] = zero;
k += len;
dense_col += ldaspa;
}
} /* for bnnz ... */
} /* for jj ... */
} /* if index ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
#endif
} /* if mycol == pc */
} /* for jb ... */
 
SUPERLU_FREE(dense);
SUPERLU_FREE(Urb_length);
SUPERLU_FREE(Urb_indptr);
#if ( PROFlevel>=1 )
if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
t_l, t_u, u_blks, nrbu);
#endif
 
} else {
/* ------------------------------------------------------------
FIRST TIME CREATING THE L AND U DATA STRUCTURES.
------------------------------------------------------------*/
 
#if ( PROFlevel>=1 )
t_l = t_u = 0; u_blks = 0;
#endif
/* We first need to set up the L and U data structures and then
* propagate the values of A into them.
*/
lsub = Glu_freeable->lsub; /* compressed L subscripts */
xlsub = Glu_freeable->xlsub;
usub = Glu_freeable->usub; /* compressed U subscripts */
xusub = Glu_freeable->xusub;
if ( !(ToRecv = intCalloc_dist(nsupers)) )
ABORT("Calloc fails for ToRecv[].");
 
k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
ABORT("Malloc fails for ToSendR[].");
j = k * grid->npcol;
if ( !(index = intMalloc_dist(j)) )
ABORT("Malloc fails for index[].");
#if ( PRNTlevel>=1 )
mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;
#endif
for (i = 0; i < j; ++i) index[i] = EMPTY;
for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
 
/* Pointers to the beginning of each block row of U. */
if ( !(Unzval_br_ptr =
(double**)SUPERLU_MALLOC(k * sizeof(double*))) )
ABORT("Malloc fails for Unzval_br_ptr[].");
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Ufstnz_br_ptr[].");
if ( !(ToSendD = intCalloc_dist(k)) )
ABORT("Malloc fails for ToSendD[].");
if ( !(ilsum = intMalloc_dist(k+1)) )
ABORT("Malloc fails for ilsum[].");
 
/* Auxiliary arrays used to set up U block data structures.
They are freed on return. */
if ( !(rb_marker = intCalloc_dist(k)) )
ABORT("Calloc fails for rb_marker[].");
if ( !(Urb_length = intCalloc_dist(k)) )
ABORT("Calloc fails for Urb_length[].");
if ( !(Urb_indptr = intMalloc_dist(k)) )
ABORT("Malloc fails for Urb_indptr[].");
if ( !(Urb_fstnz = intCalloc_dist(k)) )
ABORT("Calloc fails for Urb_fstnz[].");
if ( !(Ucbs = intCalloc_dist(k)) )
ABORT("Calloc fails for Ucbs[].");
#if ( PRNTlevel>=1 )
mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword;
#endif
/* Compute ldaspa and ilsum[]. */
ldaspa = 0;
ilsum[0] = 0;
for (gb = 0; gb < nsupers; ++gb) {
if ( myrow == PROW( gb, grid ) ) {
i = SuperSize( gb );
ldaspa += i;
lb = LBi( gb, grid );
ilsum[lb + 1] = ilsum[lb] + i;
}
}
#if ( PROFlevel>=1 )
t = SuperLU_timer_();
#endif
/* ------------------------------------------------------------
COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
------------------------------------------------------------*/
/* Loop through each supernode column. */
for (jb = 0; jb < nsupers; ++jb) {
pc = PCOL( jb, grid );
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
/* Loop through each column in the block. */
for (j = fsupc; j < fsupc + nsupc; ++j) {
/* usub[*] contains only "first nonzero" in each segment. */
for (i = xusub[j]; i < xusub[j+1]; ++i) {
irow = usub[i]; /* First nonzero of the segment. */
gb = BlockNum( irow );
kcol = PCOL( gb, grid );
ljb = LBj( gb, grid );
if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
pr = PROW( gb, grid );
lb = LBi( gb, grid );
if ( mycol == pc ) {
if ( myrow == pr ) {
ToSendD[lb] = YES;
/* Count nonzeros in entire block row. */
Urb_length[lb] += FstBlockC( gb+1 ) - irow;
if (rb_marker[lb] <= jb) {/* First see the block */
rb_marker[lb] = jb + 1;
Urb_fstnz[lb] += nsupc;
++Ucbs[lb]; /* Number of column blocks
in block row lb. */
#if ( PRNTlevel>=1 )
++nUblocks;
#endif
}
ToRecv[gb] = 1;
} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
}
} /* for i ... */
} /* for j ... */
} /* for jb ... */
/* Set up the initial pointers for each block row in U. */
nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
for (lb = 0; lb < nrbu; ++lb) {
len = Urb_length[lb];
rb_marker[lb] = 0; /* Reset block marker. */
if ( len ) {
/* Add room for descriptors */
len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
if ( !(index = intMalloc_dist(len1+1)) )
ABORT("Malloc fails for Uindex[].");
Ufstnz_br_ptr[lb] = index;
if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) )
ABORT("Malloc fails for Unzval_br_ptr[*][].");
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
index[0] = Ucbs[lb]; /* Number of column blocks */
index[1] = len; /* Total length of nzval[] */
index[2] = len1; /* Total length of index[] */
index[len1] = -1; /* End marker */
} else {
Ufstnz_br_ptr[lb] = NULL;
Unzval_br_ptr[lb] = NULL;
}
Urb_length[lb] = 0; /* Reset block length. */
Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
Urb_fstnz[lb] = BR_HEADER;
} /* for lb ... */
 
SUPERLU_FREE(Ucbs);
 
#if ( PROFlevel>=1 )
t = SuperLU_timer_() - t;
if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
#endif
#if ( PRNTlevel>=1 )
mem_use -= 2.0*k * iword;
#endif
/* Auxiliary arrays used to set up L block data structures.
They are freed on return.
k is the number of local row blocks. */
if ( !(Lrb_length = intCalloc_dist(k)) )
ABORT("Calloc fails for Lrb_length[].");
if ( !(Lrb_number = intMalloc_dist(k)) )
ABORT("Malloc fails for Lrb_number[].");
if ( !(Lrb_indptr = intMalloc_dist(k)) )
ABORT("Malloc fails for Lrb_indptr[].");
if ( !(Lrb_valptr = intMalloc_dist(k)) )
ABORT("Malloc fails for Lrb_valptr[].");
if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) )
ABORT("Calloc fails for SPA dense[].");
 
/* These counts will be used for triangular solves. */
if ( !(fmod = intCalloc_dist(k)) )
ABORT("Calloc fails for fmod[].");
if ( !(bmod = intCalloc_dist(k)) )
ABORT("Calloc fails for bmod[].");
 
/* ------------------------------------------------ */
#if ( PRNTlevel>=1 )
mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword;
#endif
k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 
/* Pointers to the beginning of each block column of L. */
if ( !(Lnzval_bc_ptr =
(double**)SUPERLU_MALLOC(k * sizeof(double*))) )
ABORT("Malloc fails for Lnzval_bc_ptr[].");
if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lrowind_bc_ptr[].");
Lrowind_bc_ptr[k-1] = NULL;
 
/* These lists of processes will be used for triangular solves. */
if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
ABORT("Malloc fails for fsendx_plist[].");
len = k * grid->nprow;
if ( !(index = intMalloc_dist(len)) )
ABORT("Malloc fails for fsendx_plist[0]");
for (i = 0; i < len; ++i) index[i] = EMPTY;
for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
fsendx_plist[i] = &index[j];
if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
ABORT("Malloc fails for bsendx_plist[].");
if ( !(index = intMalloc_dist(len)) )
ABORT("Malloc fails for bsendx_plist[0]");
for (i = 0; i < len; ++i) index[i] = EMPTY;
for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
bsendx_plist[i] = &index[j];
/* -------------------------------------------------------------- */
#if ( PRNTlevel>=1 )
mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
#endif
 
/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
 
for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
pc = PCOL( jb, grid );
if ( mycol == pc ) { /* Block column jb in my process column */
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
ljb = LBj( jb, grid ); /* Local block number */
/* Scatter A into SPA. */
for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
for (i = xa[j]; i < xa[j+1]; ++i) {
irow = asub[i];
gb = BlockNum( irow );
if ( myrow == PROW( gb, grid ) ) {
lb = LBi( gb, grid );
irow = ilsum[lb] + irow - FstBlockC( gb );
dense_col[irow] = a[i];
}
}
dense_col += ldaspa;
} /* for j ... */
 
jbrow = PROW( jb, grid );
 
/*------------------------------------------------
* SET UP U BLOCKS.
*------------------------------------------------*/
#if ( PROFlevel>=1 )
t = SuperLU_timer_();
#endif
kseen = 0;
dense_col = dense;
/* Loop through each column in the block column. */
for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
istart = xusub[j];
/* NOTE: Only the first nonzero index of the segment
is stored in usub[]. */
for (i = istart; i < xusub[j+1]; ++i) {
irow = usub[i]; /* First nonzero in the segment. */
gb = BlockNum( irow );
pr = PROW( gb, grid );
if ( pr != jbrow &&
myrow == jbrow && /* diag. proc. owning jb */
bsendx_plist[ljb][pr] == EMPTY ) {
bsendx_plist[ljb][pr] = YES;
++nbsendx;
}
if ( myrow == pr ) {
lb = LBi( gb, grid ); /* Local block number */
index = Ufstnz_br_ptr[lb];
uval = Unzval_br_ptr[lb];
fsupc1 = FstBlockC( gb+1 );
if (rb_marker[lb] <= jb) { /* First time see
the block */
rb_marker[lb] = jb + 1;
Urb_indptr[lb] = Urb_fstnz[lb];;
index[Urb_indptr[lb]] = jb; /* Descriptor */
Urb_indptr[lb] += UB_DESCRIPTOR;
/* Record the first location in index[] of the
next block */
Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
len = Urb_indptr[lb];/* Start fstnz in index */
index[len-1] = 0;
for (k = 0; k < nsupc; ++k)
index[len+k] = fsupc1;
if ( gb != jb )/* Exclude diagonal block. */
++bmod[lb];/* Mod. count for back solve */
if ( kseen == 0 && myrow != jbrow ) {
++nbrecvx;
kseen = 1;
}
} else { /* Already saw the block */
len = Urb_indptr[lb];/* Start fstnz in index */
}
jj = j - fsupc;
index[len+jj] = irow;
/* Load the numerical values */
k = fsupc1 - irow; /* No. of nonzeros in segment */
index[len-1] += k; /* Increment block length in
Descriptor */
irow = ilsum[lb] + irow - FstBlockC( gb );
for (ii = 0; ii < k; ++ii) {
uval[Urb_length[lb]++] = dense_col[irow + ii];
dense_col[irow + ii] = zero;
}
} /* if myrow == pr ... */
} /* for i ... */
dense_col += ldaspa;
} /* for j ... */
 
#if ( PROFlevel>=1 )
t_u += SuperLU_timer_() - t;
t = SuperLU_timer_();
#endif
/*------------------------------------------------
* SET UP L BLOCKS.
*------------------------------------------------*/
 
/* Count number of blocks and length of each block. */
nrbl = 0;
len = 0; /* Number of row subscripts I own. */
kseen = 0;
istart = xlsub[fsupc];
for (i = istart; i < xlsub[fsupc+1]; ++i) {
irow = lsub[i];
gb = BlockNum( irow ); /* Global block number */
pr = PROW( gb, grid ); /* Process row owning this block */
if ( pr != jbrow &&
myrow == jbrow && /* diag. proc. owning jb */
fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
fsendx_plist[ljb][pr] = YES;
++nfsendx;
}
if ( myrow == pr ) {
lb = LBi( gb, grid ); /* Local block number */
if (rb_marker[lb] <= jb) { /* First see this block */
rb_marker[lb] = jb + 1;
Lrb_length[lb] = 1;
Lrb_number[nrbl++] = gb;
if ( gb != jb ) /* Exclude diagonal block. */
++fmod[lb]; /* Mod. count for forward solve */
if ( kseen == 0 && myrow != jbrow ) {
++nfrecvx;
kseen = 1;
}
#if ( PRNTlevel>=1 )
++nLblocks;
#endif
} else {
++Lrb_length[lb];
}
++len;
}
} /* for i ... */
 
if ( nrbl ) { /* Do not ensure the blocks are sorted! */
/* Set up the initial pointers for each block in
index[] and nzval[]. */
/* Add room for descriptors */
len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
if ( !(index = intMalloc_dist(len1)) )
ABORT("Malloc fails for index[]");
Lrowind_bc_ptr[ljb] = index;
if (!(Lnzval_bc_ptr[ljb] =
doubleMalloc_dist(len*nsupc))) {
fprintf(stderr, "col block %d ", jb);
ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
}
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
index[0] = nrbl; /* Number of row blocks */
index[1] = len; /* LDA of the nzval[] */
next_lind = BC_HEADER;
next_lval = 0;
for (k = 0; k < nrbl; ++k) {
gb = Lrb_number[k];
lb = LBi( gb, grid );
len = Lrb_length[lb];
Lrb_length[lb] = 0; /* Reset vector of block length */
index[next_lind++] = gb; /* Descriptor */
index[next_lind++] = len;
Lrb_indptr[lb] = next_lind;
Lrb_valptr[lb] = next_lval;
next_lind += len;
next_lval += len;
}
/* Propagate the compressed row subscripts to Lindex[],
and the initial values of A from SPA into Lnzval[]. */
lusup = Lnzval_bc_ptr[ljb];
len = index[1]; /* LDA of lusup[] */
for (i = istart; i < xlsub[fsupc+1]; ++i) {
irow = lsub[i];
gb = BlockNum( irow );
if ( myrow == PROW( gb, grid ) ) {
lb = LBi( gb, grid );
k = Lrb_indptr[lb]++; /* Random access a block */
index[k] = irow;
k = Lrb_valptr[lb]++;
irow = ilsum[lb] + irow - FstBlockC( gb );
for (j = 0, dense_col = dense; j < nsupc; ++j) {
lusup[k] = dense_col[irow];
dense_col[irow] = zero;
k += len;
dense_col += ldaspa;
}
}
} /* for i ... */
} else {
Lrowind_bc_ptr[ljb] = NULL;
Lnzval_bc_ptr[ljb] = NULL;
} /* if nrbl ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
#endif
} /* if mycol == pc */
 
} /* for jb ... */
 
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
Llu->Unzval_br_ptr = Unzval_br_ptr;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
Llu->ToSendR = ToSendR;
Llu->fmod = fmod;
Llu->fsendx_plist = fsendx_plist;
Llu->nfrecvx = nfrecvx;
Llu->nfsendx = nfsendx;
Llu->bmod = bmod;
Llu->bsendx_plist = bsendx_plist;
Llu->nbrecvx = nbrecvx;
Llu->nbsendx = nbsendx;
Llu->ilsum = ilsum;
Llu->ldalsum = ldaspa;
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
nLblocks, nUblocks);
#endif
 
SUPERLU_FREE(rb_marker);
SUPERLU_FREE(Urb_fstnz);
SUPERLU_FREE(Urb_length);
SUPERLU_FREE(Urb_indptr);
SUPERLU_FREE(Lrb_length);
SUPERLU_FREE(Lrb_number);
SUPERLU_FREE(Lrb_indptr);
SUPERLU_FREE(Lrb_valptr);
SUPERLU_FREE(dense);
 
/* Find the maximum buffer size. */
MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t,
MPI_MAX, grid->comm);
 
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(Llu->mod_bit = intMalloc_dist(k)) )
ABORT("Malloc fails for mod_bit[].");
 
#if ( PROFlevel>=1 )
if ( !iam ) printf(".. 1st distribute time:\n "
"\tL\t%.2f\n\tU\t%.2f\n"
"\tu_blks %d\tnrbu %d\n--------\n",
t_l, t_u, u_blks, nrbu);
#endif
 
} /* else fact != SamePattern_SameRowPerm */
 
if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */
SUPERLU_FREE(asub);
SUPERLU_FREE(a);
}
SUPERLU_FREE(xa);
 
#if ( DEBUGlevel>=1 )
/* Memory allocated but not freed:
ilsum, fmod, fsendx_plist, bmod, bsendx_plist */
CHECK_MALLOC(iam, "Exit pddistribute()");
#endif
return (mem_use);
} /* PDDISTRIBUTE */
/trunk/OTHER/SuperLU_DIST_2.5/SRC/dreadrb.c
New file
0,0 → 1,240
 
/*! @file dreadrb.c
* \brief Read a matrix stored in Rutherford-Boeing format
*
* <pre>
* -- Distributed SuperLU routine (version 2.3) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* July 15, 2009
* </pre>
*
* Purpose
* =======
*
* Read a DOUBLE PRECISION matrix stored in Rutherford-Boeing format
* as described below.
*
* Line 1 (A72, A8)
* Col. 1 - 72 Title (TITLE)
* Col. 73 - 80 Matrix name / identifier (MTRXID)
*
* Line 2 (I14, 3(1X, I13))
* Col. 1 - 14 Total number of lines excluding header (TOTCRD)
* Col. 16 - 28 Number of lines for pointers (PTRCRD)
* Col. 30 - 42 Number of lines for row (or variable) indices (INDCRD)
* Col. 44 - 56 Number of lines for numerical values (VALCRD)
*
* Line 3 (A3, 11X, 4(1X, I13))
* Col. 1 - 3 Matrix type (see below) (MXTYPE)
* Col. 15 - 28 Compressed Column: Number of rows (NROW)
* Elemental: Largest integer used to index variable (MVAR)
* Col. 30 - 42 Compressed Column: Number of columns (NCOL)
* Elemental: Number of element matrices (NELT)
* Col. 44 - 56 Compressed Column: Number of entries (NNZERO)
* Elemental: Number of variable indeces (NVARIX)
* Col. 58 - 70 Compressed Column: Unused, explicitly zero
* Elemental: Number of elemental matrix entries (NELTVL)
*
* Line 4 (2A16, A20)
* Col. 1 - 16 Fortran format for pointers (PTRFMT)
* Col. 17 - 32 Fortran format for row (or variable) indices (INDFMT)
* Col. 33 - 52 Fortran format for numerical values of coefficient matrix
* (VALFMT)
* (blank in the case of matrix patterns)
*
* The three character type field on line 3 describes the matrix type.
* The following table lists the permitted values for each of the three
* characters. As an example of the type field, RSA denotes that the matrix
* is real, symmetric, and assembled.
*
* First Character:
* R Real matrix
* C Complex matrix
* I integer matrix
* P Pattern only (no numerical values supplied)
* Q Pattern only (numerical values supplied in associated auxiliary value
* file)
*
* Second Character:
* S Symmetric
* U Unsymmetric
* H Hermitian
* Z Skew symmetric
* R Rectangular
*
* Third Character:
* A Compressed column form
* E Elemental form
*
* </pre>
*/
 
#include "superlu_ddefs.h"
 
 
/*! \brief Eat up the rest of the current line */
static int dDumpLine(FILE *fp)
{
register int c;
while ((c = fgetc(fp)) != '\n') ;
return 0;
}
 
static int dParseIntFormat(char *buf, int *num, int *size)
{
char *tmp;
 
tmp = buf;
while (*tmp++ != '(') ;
sscanf(tmp, "%d", num);
while (*tmp != 'I' && *tmp != 'i') ++tmp;
++tmp;
sscanf(tmp, "%d", size);
return 0;
}
 
static int dParseFloatFormat(char *buf, int *num, int *size)
{
char *tmp, *period;
 
tmp = buf;
while (*tmp++ != '(') ;
*num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd'
&& *tmp != 'F' && *tmp != 'f') {
/* May find kP before nE/nD/nF, like (1P6F13.6). In this case the
num picked up refers to P, which should be skipped. */
if (*tmp=='p' || *tmp=='P') {
++tmp;
*num = atoi(tmp); /*sscanf(tmp, "%d", num);*/
} else {
++tmp;
}
}
++tmp;
period = tmp;
while (*period != '.' && *period != ')') ++period ;
*period = '\0';
*size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/
 
return 0;
}
 
static int ReadVector(FILE *fp, int n, int *where, int perline, int persize)
{
register int i, j, item;
char tmp, buf[100];
 
i = 0;
while (i < n) {
fgets(buf, 100, fp); /* read a line at a time */
for (j=0; j<perline && i<n; j++) {
tmp = buf[(j+1)*persize]; /* save the char at that place */
buf[(j+1)*persize] = 0; /* null terminate */
item = atoi(&buf[j*persize]);
buf[(j+1)*persize] = tmp; /* recover the char at that place */
where[i++] = item - 1;
}
}
 
return 0;
}
 
static int dReadValues(FILE *fp, int n, double *destination, int perline,
int persize)
{
register int i, j, k, s;
char tmp, buf[100];
 
i = 0;
while (i < n) {
fgets(buf, 100, fp); /* read a line at a time */
for (j=0; j<perline && i<n; j++) {
tmp = buf[(j+1)*persize]; /* save the char at that place */
buf[(j+1)*persize] = 0; /* null terminate */
s = j*persize;
for (k = 0; k < persize; ++k) /* No D_ format in C */
if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E';
destination[i++] = atof(&buf[s]);
buf[(j+1)*persize] = tmp; /* recover the char at that place */
}
}
 
return 0;
}
 
 
 
void
dreadrb(FILE *fp, int *nrow, int *ncol, int *nonz,
double **nzval, int **rowind, int **colptr)
{
 
register int i, numer_lines = 0;
int tmp, colnum, colsize, rownum, rowsize, valnum, valsize;
char buf[100], type[4];
 
 
/* Line 1 */
fgets(buf, 100, fp);
fputs(buf, stdout);
 
/* Line 2 */
for (i=0; i<4; i++) {
fscanf(fp, "%14c", buf); buf[14] = 0;
sscanf(buf, "%d", &tmp);
if (i == 3) numer_lines = tmp;
}
dDumpLine(fp);
 
/* Line 3 */
fscanf(fp, "%3c", type);
fscanf(fp, "%11c", buf); /* pad */
type[3] = 0;
#ifdef DEBUG
printf("Matrix type %s\n", type);
#endif
 
fscanf(fp, "%14c", buf); sscanf(buf, "%d", nrow);
fscanf(fp, "%14c", buf); sscanf(buf, "%d", ncol);
fscanf(fp, "%14c", buf); sscanf(buf, "%d", nonz);
fscanf(fp, "%14c", buf); sscanf(buf, "%d", &tmp);
 
if (tmp != 0)
printf("This is not an assembled matrix!\n");
if (*nrow != *ncol)
printf("Matrix is not square.\n");
dDumpLine(fp);
 
/* Allocate storage for the three arrays ( nzval, rowind, colptr ) */
dallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
 
/* Line 4: format statement */
fscanf(fp, "%16c", buf);
dParseIntFormat(buf, &colnum, &colsize);
fscanf(fp, "%16c", buf);
dParseIntFormat(buf, &rownum, &rowsize);
fscanf(fp, "%20c", buf);
dParseFloatFormat(buf, &valnum, &valsize);
dDumpLine(fp);
 
#ifdef DEBUG
printf("%d rows, %d nonzeros\n", *nrow, *nonz);
printf("colnum %d, colsize %d\n", colnum, colsize);
printf("rownum %d, rowsize %d\n", rownum, rowsize);
printf("valnum %d, valsize %d\n", valnum, valsize);
#endif
 
ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
ReadVector(fp, *nonz, *rowind, rownum, rowsize);
if ( numer_lines ) {
dReadValues(fp, *nonz, *nzval, valnum, valsize);
}
 
//sym = (type[1] == 'S' || type[1] == 's');
//if ( sym ) {
FormFullA(*ncol, nonz, nzval, rowind, colptr);
//}
 
fclose(fp);
}
/trunk/OTHER/SuperLU_DIST_2.5/SRC/memory.c.orig
New file
0,0 → 1,546
/*
* -- Distributed SuperLU routine (version 1.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* September 1, 1999
*
*/
 
#include "superlu_ddefs.h"
 
#define PRNTlevel 0
/*
* Global variables
*/
ExpHeader *expanders; /* Array of pointers to 4 types of memory */
LU_stack_t stack;
int_t no_expand;
 
 
/*
* Prototype
*/
static int_t memory_usage(const int_t, const int_t, const int_t);
static void *expand(int_t *, MemType, int_t, int_t,
Glu_freeable_t *);
 
/*
* Internal prototypes
*/
void SetupSpace (void *, int_t, LU_space_t *);
 
 
void
superlu_abort_and_exit_dist(char *msg)
{
/*fprintf(stderr, msg);
fflush(stderr);*/
printf(msg);
exit (-1);
}
 
long int superlu_malloc_total = 0;
 
#if ( DEBUGlevel>=1 ) /* Debug malloc/free. */
 
#define PAD_FACTOR 2
#define DWORD (sizeof(double)) /* Be sure it's no smaller than double. */
 
void *superlu_malloc_dist(size_t size)
{
char *buf;
int iam;
 
MPI_Comm_rank(MPI_COMM_WORLD, &iam);
buf = (char *) malloc(size + DWORD);
if ( !buf ) {
printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %d\n",
iam, superlu_malloc_total*1e-6, size);
ABORT("superlu_malloc: out of memory");
}
 
((int_t *) buf)[0] = size;
#if 0
superlu_malloc_total += size + DWORD;
#else
superlu_malloc_total += size;
#endif
return (void *) (buf + DWORD);
}
 
void superlu_free_dist(void *addr)
{
char *p = ((char *) addr) - DWORD;
 
if ( !addr )
ABORT("superlu_free: tried to free NULL pointer");
 
if ( !p )
ABORT("superlu_free: tried to free NULL+DWORD pointer");
 
{
int_t n = ((int_t *) p)[0];
if ( !n )
ABORT("superlu_free: tried to free a freed pointer");
*((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */
#if 0
superlu_malloc_total -= (n + DWORD);
#else
superlu_malloc_total -= n;
#endif
 
if ( superlu_malloc_total < 0 )
ABORT("superlu_malloc_total went negative");
/*free (addr);*/
free (p);
}
 
}
 
#else /* The production mode. */
 
void *superlu_malloc_dist(size_t size)
{
void *buf;
buf = (void *) malloc(size);
return (buf);
}
 
void superlu_free_dist(void *addr)
{
free (addr);
}
 
#endif /* End debug malloc/free. */
 
 
 
static void
copy_mem_int(int_t howmany, void *old, void *new)
{
register int_t i;
int_t *iold = old;
int_t *inew = new;
for (i = 0; i < howmany; i++) inew[i] = iold[i];
}
 
 
static void
user_bcopy(char *src, char *dest, int_t bytes)
{
char *s_ptr, *d_ptr;
 
s_ptr = src + bytes - 1;
d_ptr = dest + bytes - 1;
for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr;
}
 
 
 
int_t *intMalloc_dist(int_t n)
{
int_t *buf;
buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
return (buf);
}
 
int_t *intCalloc_dist(int_t n)
{
int_t *buf;
register int_t i;
buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
if ( buf )
for (i = 0; i < n; ++i) buf[i] = 0;
return (buf);
}
 
 
void *user_malloc_dist(int_t bytes, int_t which_end)
{
void *buf;
if ( StackFull(bytes) ) return (NULL);
 
if ( which_end == HEAD ) {
buf = (char*) stack.array + stack.top1;
stack.top1 += bytes;
} else {
stack.top2 -= bytes;
buf = (char*) stack.array + stack.top2;
}
stack.used += bytes;
return buf;
}
 
void user_free_dist(int_t bytes, int_t which_end)
{
if ( which_end == HEAD ) {
stack.top1 -= bytes;
} else {
stack.top2 += bytes;
}
stack.used -= bytes;
}
 
 
/*
* Setup the memory model to be used for factorization.
* lwork = 0: use system malloc;
* lwork > 0: use user-supplied work[] space.
*/
void SetupSpace(void *work, int_t lwork, LU_space_t *MemModel)
{
if ( lwork == 0 ) {
*MemModel = SYSTEM; /* malloc/free */
} else if ( lwork > 0 ) {
*MemModel = USER; /* user provided space */
stack.used = 0;
stack.top1 = 0;
stack.top2 = (lwork/4)*4; /* must be word addressable */
stack.size = stack.top2;
stack.array = (void *) work;
}
}
 
 
 
/*
* Allocate storage for the data structures common to symbolic factorization
* routines. For those unpredictable size, make a guess as FILL * nnz(A).
* Return value:
* If lwork = -1, return the estimated amount of space required, plus n;
* otherwise, return the amount of space actually allocated when
* memory allocation failure occurred.
*/
/************************************************************************/
int_t symbfact_SubInit
/************************************************************************/
(
fact_t fact, void *work, int_t lwork, int_t m, int_t n, int_t annz,
Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable
)
{
int_t iword;
int_t *xsup, *supno;
int_t *lsub, *xlsub;
int_t *usub, *xusub;
int_t nzlmax, nzumax;
int_t FILL = sp_ienv_dist(6);
 
#if ( DEBUGlevel>=1 )
int iam;
MPI_Comm_rank( MPI_COMM_WORLD, &iam );
CHECK_MALLOC(iam, "Enter symbfact_SubInit()");
#endif
 
no_expand = 0;
iword = sizeof(int_t);
 
expanders = (ExpHeader *) SUPERLU_MALLOC( NO_MEMTYPE*sizeof(ExpHeader) );
if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
if ( fact == DOFACT || fact == SamePattern ) {
/* Guess for L\U factors */
nzlmax = FILL * annz;
nzumax = FILL/2.0 * annz;
 
if ( lwork == -1 ) {
return ( GluIntArray(n) * iword + TempSpace(m,1)
+ (nzlmax+nzumax)*iword + n );
} else {
SetupSpace(work, lwork, &Glu_freeable->MemModel);
}
#if ( PRNTlevel>=1 )
printf(".. symbfact_SubInit(): annz %ld, nzlmax %ld, nzumax %ld\n",
annz, nzlmax, nzumax);
#endif
/* Integer pointers for L\U factors */
if ( Glu_freeable->MemModel == SYSTEM ) {
xsup = intMalloc_dist(n+1);
supno = intMalloc_dist(n+1);
xlsub = intMalloc_dist(n+1);
xusub = intMalloc_dist(n+1);
} else {
xsup = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
supno = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
xlsub = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
xusub = (int_t *)user_malloc_dist((n+1) * iword, HEAD);
}
 
lsub = (int_t *) expand(&nzlmax, LSUB, 0, 0, Glu_freeable);
usub = (int_t *) expand(&nzumax, USUB, 0, 0, Glu_freeable);
 
while ( !lsub || !usub ) {
if ( Glu_freeable->MemModel == SYSTEM ) {
SUPERLU_FREE(lsub);
SUPERLU_FREE(usub);
} else {
user_free_dist((nzlmax+nzumax)*iword, HEAD);
}
nzlmax /= 2;
nzumax /= 2;
if ( nzumax < annz/2 ) {
printf("Not enough memory to perform factorization.\n");
return (memory_usage(nzlmax, nzumax, n) + n);
}
#if ( PRNTlevel>=1 )
printf(".. symbfact_SubInit() reduce size:"
"nzlmax %ld, nzumax %ld\n", nzlmax, nzumax);
fflush(stdout);
#endif
lsub = (int_t *) expand( &nzlmax, LSUB, 0, 0, Glu_freeable );
usub = (int_t *) expand( &nzumax, USUB, 0, 1, Glu_freeable );
}
 
Glu_persist->xsup = xsup;
Glu_persist->supno = supno;
Glu_freeable->lsub = lsub;
Glu_freeable->xlsub = xlsub;
Glu_freeable->usub = usub;
Glu_freeable->xusub = xusub;
Glu_freeable->nzlmax = nzlmax;
Glu_freeable->nzumax = nzumax;
} else {
/* fact == SamePattern_SameRowPerm */
if ( lwork == -1 ) {
return ( GluIntArray(n) * iword + TempSpace(m, 1)
+ (nzlmax+nzumax)*iword + n );
} else if ( lwork == 0 ) {
Glu_freeable->MemModel = SYSTEM;
} else {
Glu_freeable->MemModel = USER;
stack.top2 = (lwork/4)*4; /* must be word-addressable */
stack.size = stack.top2;
}
expanders[USUB].mem = Glu_freeable->usub;
expanders[LSUB].mem = Glu_freeable->lsub;
expanders[USUB].size = nzumax;
expanders[LSUB].size = nzlmax;
}
 
++no_expand;
 
#if ( DEBUGlevel>=1 )
/* Memory allocated but not freed: xsup, supno */
CHECK_MALLOC(iam, "Exit symbfact_SubInit()");
#endif
 
return 0;
} /* SYMBFACT_SUBINIT */
 
/*
* Expand the data structures for L and U during the factorization.
* Return value: 0 - successful return
* > 0 - number of bytes allocated when run out of space
*/
/************************************************************************/
int_t symbfact_SubXpand
/************************************************************************/
(
int_t n, /* total number of columns */
int_t jcol, /* current column */
int_t next, /* number of elements currently in the factors */
MemType mem_type, /* which type of memory to expand */
int_t *maxlen, /* modified - maximum length of a data structure */
Glu_freeable_t *Glu_freeable /* modified - global LU data structures */
)
{
void *new_mem;
#if ( DEBUGlevel>=1 )
printf("symbfact_SubXpand(): jcol %d, next %ld, maxlen %ld, MemType %d\n",
jcol, next, *maxlen, mem_type);
#endif
 
new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable);
if ( !new_mem ) {
int_t nzlmax = Glu_freeable->nzlmax;
int_t nzumax = Glu_freeable->nzumax;
fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
return (memory_usage(nzlmax, nzumax, n) + n);
}
 
if ( mem_type == LSUB ) {
Glu_freeable->lsub = (int_t *) new_mem;
Glu_freeable->nzlmax = *maxlen;
} else if ( mem_type == USUB ) {
Glu_freeable->usub = (int_t *) new_mem;
Glu_freeable->nzumax = *maxlen;
} else ABORT("Tries to expand nonexisting memory type.\n");
return 0;
} /* LUSUB_XPAND */
 
/*
* Deallocate storage of the data structures common to symbolic
* factorization routines.
*/
/************************************************************************/
int_t symbfact_SubFree(Glu_freeable_t *Glu_freeable)
/************************************************************************/
{
#if ( DEBUGlevel>=1 )
int iam;
MPI_Comm_rank( MPI_COMM_WORLD, &iam );
CHECK_MALLOC(iam, "Enter symbfact_SubFree()");
#endif
SUPERLU_FREE(expanders);
SUPERLU_FREE(Glu_freeable->lsub);
SUPERLU_FREE(Glu_freeable->xlsub);
SUPERLU_FREE(Glu_freeable->usub);
SUPERLU_FREE(Glu_freeable->xusub);
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit symbfact_SubFree()");
#endif
return 0;
} /* SYMBFACT_SUBFREE */
 
 
/*
* Expand the existing storage to accommodate more fill-ins.
*/
/************************************************************************/
static void *expand
/************************************************************************/
(
int_t *prev_len, /* length used from previous call */
MemType type, /* which part of the memory to expand */
int_t len_to_copy, /* size of the memory to be copied to new store */
int_t keep_prev, /* = 1: use prev_len;
= 0: compute new_len to expand */
Glu_freeable_t *Glu_freeable /* modified - global LU data structures */
)
{
float EXPAND = 1.5;
float alpha;
void *new_mem;
int_t new_len, tries, lword, extra, bytes_to_copy;
 
alpha = EXPAND;
lword = sizeof(int_t);
 
if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
new_len = *prev_len;
else {
new_len = alpha * *prev_len;
}
 
if ( Glu_freeable->MemModel == SYSTEM ) {
new_mem = (void *) SUPERLU_MALLOC(new_len * lword);
/*new_mem = (void *) calloc(new_len, lword); */
if ( no_expand != 0 ) {
tries = 0;
if ( keep_prev ) {
if ( !new_mem ) return (NULL);
} else {
while ( !new_mem ) {
if ( ++tries > 10 ) return (NULL);
alpha = Reduce(alpha);
new_len = alpha * *prev_len;
new_mem = (void *) SUPERLU_MALLOC(new_len * lword);
/* new_mem = (void *) calloc(new_len, lword); */
}
}
copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
SUPERLU_FREE (expanders[type].mem);
}
expanders[type].mem = (void *) new_mem;
} else { /* MemModel == USER */
if ( no_expand == 0 ) {
new_mem = user_malloc_dist(new_len * lword, HEAD);
expanders[type].mem = (void *) new_mem;
}
else {
tries = 0;
extra = (new_len - *prev_len) * lword;
if ( keep_prev ) {
if ( StackFull(extra) ) return (NULL);
} else {
while ( StackFull(extra) ) {
if ( ++tries > 10 ) return (NULL);
alpha = Reduce(alpha);
new_len = alpha * *prev_len;
extra = (new_len - *prev_len) * lword;
}
}
 
if ( type != USUB ) {
new_mem = (void*)((char*)expanders[type + 1].mem + extra);
bytes_to_copy = (char*)stack.array + stack.top1
- (char*)expanders[type + 1].mem;
user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);
 
if ( type < USUB ) {
Glu_freeable->usub = expanders[USUB].mem =
(void*)((char*)expanders[USUB].mem + extra);
}
if ( type < LSUB ) {
Glu_freeable->lsub = expanders[LSUB].mem =
(void*)((char*)expanders[LSUB].mem + extra);
}
stack.top1 += extra;
stack.used += extra;
} /* if ... */
 
} /* else ... */
}
 
expanders[type].size = new_len;
*prev_len = new_len;
if ( no_expand ) ++no_expand;
return (void *) expanders[type].mem;
} /* EXPAND */
 
/*
* mem_usage consists of the following fields:
* - for_lu (float)
* The amount of space used in bytes for the L\U data structures.
* - total (float)
* The amount of space needed in bytes to perform factorization.
* - expansions (int)
* Number of memory expansions during the LU factorization.
*/
/************************************************************************/
int_t QuerySpace_dist(int_t n, int_t lsub_size, Glu_freeable_t *Glu_freeable,
mem_usage_t *mem_usage)
/************************************************************************/
{
register int_t iword = sizeof(int_t);
extern int_t no_expand;
 
/* For the adjacency graphs of L and U. */
/*mem_usage->for_lu = (float)( (4*n + 3) * iword +
Glu_freeable->xlsub[n]*iword );*/
mem_usage->for_lu = (float)( (4*n + 3) * iword +
lsub_size * iword );
mem_usage->for_lu += (float)( (n + 1) * iword +
Glu_freeable->xusub[n]*iword );
 
/* Working storage to support factorization */
mem_usage->total = mem_usage->for_lu + 9*n*iword;
 
mem_usage->expansions = --no_expand;
return 0;
} /* QUERYSPACE_DIST */
 
static int_t
memory_usage(const int_t nzlmax, const int_t nzumax, const int_t n)
{
register int_t iword = sizeof(int_t);
return (10*n*iword + (nzlmax+nzumax)*iword);
}
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/get_perm_c_parmetis.c
New file
0,0 → 1,905
/*! @file
* \brief Gets matrix permutation
*
* <pre>
* -- Distributed symbolic factorization auxialiary routine (version 2.1) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003
* INRIA France - January 2004
* Laura Grigori
*
* November 1, 2007
* </pre>
*/
 
/* limits.h: the largest positive integer (INT_MAX) */
#include <limits.h>
#include <math.h>
#include "superlu_ddefs.h"
 
/*
* Internal protypes
*/
 
static float
a_plus_at_CompRow_loc
(int, int_t *, int, int_t *, int_t , int_t *, int_t *,
int, int_t *, int_t *, int_t **, int_t **, gridinfo_t *);
 
/*! \brief
*
* <pre>
* Purpose
* =======
*
* GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a
* graph partitioning algorithm to the symmetrized graph A+A'. The
* multilevel graph partitioning algorithm used is the
* ParMETIS_V3_NodeND routine available in the parallel graph
* partitioning package parMETIS.
*
* The number of independent sub-domains noDomains computed by this
* algorithm has to be a power of 2. Hence noDomains is the larger
* number power of 2 that is smaller than nprocs_i, where nprocs_i = nprow
* * npcol is the number of processors used in SuperLU_DIST.
*
* Arguments
* =========
*
* A (input) SuperMatrix*
* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
* of the linear equations is A->nrow. Matrix A is distributed
* in NRformat_loc format.
*
* perm_r (input) int_t*
* Row permutation vector of size A->nrow, which defines the
* permutation matrix Pr; perm_r[i] = j means row i of A is in
* position j in Pr*A.
*
* perm_c (output) int_t*
* Column permutation vector of size A->ncol, which defines the
* permutation matrix Pc; perm_c[i] = j means column i of A is
* in position j in A*Pc.
*
* nprocs_i (input) int*
* Number of processors the input matrix is distributed on in a block
* row format. It corresponds to number of processors used in
* SuperLU_DIST.
*
* noDomains (input) int*, must be power of 2
* Number of independent domains to be computed by the graph
* partitioning algorithm. ( noDomains <= nprocs_i )
*
* sizes (output) int_t**, of size 2 * noDomains
* Returns pointer to an array containing the number of nodes
* for each sub-domain and each separator. Separators are stored
* from left to right.
* Memory for the array is allocated in this routine.
*
* fstVtxSep (output) int_t**, of size 2 * noDomains
* Returns pointer to an array containing first node for each
* sub-domain and each separator.
* Memory for the array is allocated in this routine.
*
* Return value
* ============
* < 0, number of bytes allocated on return from the symbolic factorization.
* > 0, number of bytes allocated when out of memory.
* </pre>
*/
float
get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
int nprocs_i, int noDomains,
int_t **sizes, int_t **fstVtxSep,
gridinfo_t *grid, MPI_Comm *metis_comm)
 
{
NRformat_loc *Astore;
int iam, p;
int *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int;
int *options, numflag;
int_t m_loc, nnz_loc, fst_row;
int_t m, n, bnz, i, j;
int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
int_t *b_rowptr, *b_colind;
int_t *dist_order;
int *recvcnts, *displs;
/* first row index on each processor when the matrix is distributed
on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
int_t *vtxdist_i, *vtxdist_o;
int_t szSep, k, noNodes;
float apat_mem_l; /* memory used during the computation of the graph of A+A' */
float mem; /* Memory used during this routine */
MPI_Status status;
 
/* Initialization. */
MPI_Comm_rank (grid->comm, &iam);
n = A->ncol;
m = A->nrow;
if ( m != n ) ABORT("Matrix is not square");
mem = 0.;
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()");
#endif
 
Astore = (NRformat_loc *) A->Store;
nnz_loc = Astore->nnz_loc; /* number of nonzeros in the local submatrix */
m_loc = Astore->m_loc; /* number of rows local to this processor */
fst_row = Astore->fst_row; /* global index of the first row */
rowptr = Astore->rowptr; /* pointer to rows and column indices */
colind = Astore->colind;
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n",
noDomains);
#endif
 
numflag = 0;
/* determine first row on each processor */
vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i.");
vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t));
if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o.");
 
MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t,
grid->comm);
vtxdist_i[nprocs_i] = m;
 
if (noDomains == nprocs_i) {
/* keep the same distribution of A */
for (p = 0; p <= nprocs_i; p++)
vtxdist_o[p] = vtxdist_i[p];
}
else {
i = n / noDomains;
j = n % noDomains;
for (k = 0, p = 0; p < noDomains; p++) {
vtxdist_o[p] = k;
k += i;
if (p < j) k++;
}
/* The remaining non-participating processors get the same
first-row-number as the last processor. */
for (p = noDomains; p <= nprocs_i; p++)
vtxdist_o[p] = k;
}
 
#if ( DEBUGlevel>=2 )
if (!iam)
PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o);
#endif
 
/* Compute distributed A + A' */
if ((apat_mem_l =
a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i,
n, rowptr, colind, noDomains, vtxdist_o,
&bnz, &b_rowptr, &b_colind, grid)) > 0)
return (apat_mem_l);
mem += -apat_mem_l;
/* Initialize and allocate storage for parMetis. */
(*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes.");
l_sizes = *sizes;
(*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t));
if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
l_fstVtxSep = *fstVtxSep;
m_loc = vtxdist_o[iam+1] - vtxdist_o[iam];
if ( iam < noDomains)
/* dist_order_int is the perm returned by parMetis, distributed */
if (! (dist_order_int = (int *) SUPERLU_MALLOC(m_loc * sizeof(int))))
ABORT("SUPERLU_MALLOC fails for dist_order_int.");
 
/* ParMETIS represents the column pointers and row indices of *
* the input matrix using integers. When SuperLU_DIST uses *
* long int for the int_t type, then several supplementary *
* copies need to be performed in order to call ParMETIS. */
#if defined (_LONGINT)
l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int));
if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int.");
/* Allocate storage */
if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int))))
ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]");
for (i = 0; i <= m_loc; i++)
b_rowptr_int[i] = b_rowptr[i];
SUPERLU_FREE (b_rowptr);
if ( bnz ) {
if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int))))
ABORT("SUPERLU_MALLOC fails for b_colind_int[]");
for (i = 0; i < bnz; i++)
b_colind_int[i] = b_colind[i];
SUPERLU_FREE (b_colind);
}
if ( !(vtxdist_o_int =
(int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int))))
ABORT("SUPERLU_MALLOC fails for vtxdist_o_int.");
for (i = 0; i <= nprocs_i; i++)
vtxdist_o_int[i] = vtxdist_o[i];
SUPERLU_FREE (vtxdist_o);
 
#else /* Default */
 
vtxdist_o_int = vtxdist_o;
b_rowptr_int = b_rowptr; b_colind_int = b_colind;
l_sizes_int = l_sizes;
 
#endif
if ( iam < noDomains) {
options = (int *) SUPERLU_MALLOC(4 * sizeof(int));
options[0] = 0;
options[1] = 0;
options[2] = 0;
options[3] = 1;
 
ParMETIS_V3_NodeND(vtxdist_o_int, b_rowptr_int, b_colind_int,
&numflag, options,
dist_order_int, l_sizes_int, metis_comm);
}
if (bnz)
SUPERLU_FREE (b_colind_int);
if ( iam < noDomains) {
SUPERLU_FREE (options);
}
SUPERLU_FREE (b_rowptr_int);
#if defined (_LONGINT)
/* Copy data from dist_order_int to dist_order */
if ( iam < noDomains) {
/* dist_order is the perm returned by parMetis, distributed */
if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for dist_order.");
for (i = 0; i < m_loc; i++)
dist_order[i] = dist_order_int[i];
SUPERLU_FREE(dist_order_int);
for (i = 0; i < 2*noDomains; i++)
l_sizes[i] = l_sizes_int[i];
SUPERLU_FREE(l_sizes_int);
}
#else
dist_order = dist_order_int;
#endif
/* Allgatherv dist_order to get perm_c */
if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
ABORT ("SUPERLU_MALLOC fails for displs.");
if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
ABORT ("SUPERLU_MALLOC fails for recvcnts.");
for (i = 0; i < nprocs_i; i++)
recvcnts[i] = vtxdist_o_int[i+1] - vtxdist_o_int[i];
displs[0]=0;
for(i=1; i < nprocs_i; i++)
displs[i] = displs[i-1] + recvcnts[i-1];
MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs,
mpi_int_t, grid->comm);
 
if ( iam < noDomains) {
SUPERLU_FREE (dist_order);
}
SUPERLU_FREE (vtxdist_i);
SUPERLU_FREE (vtxdist_o_int);
SUPERLU_FREE (recvcnts);
SUPERLU_FREE (displs);
/* send l_sizes to every processor p >= noDomains */
if (!iam)
for (p = noDomains; p < nprocs_i; p++)
MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm);
if (noDomains <= iam && iam < nprocs_i)
MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm,
&status);
/* Determine the first node in each separator, store it in l_fstVtxSep */
for (j = 0; j < 2 * noDomains; j++)
l_fstVtxSep[j] = 0;
l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2];
szSep = noDomains;
i = 0;
while (szSep != 1) {
for (j = i; j < i + szSep; j++) {
l_fstVtxSep[j] += l_sizes[j];
}
for (j = i; j < i + szSep; j++) {
k = i + szSep + (j-i) / 2;
l_fstVtxSep[k] += l_fstVtxSep[j];
}
i += szSep;
szSep = szSep / 2;
}
l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2];
i = 2 * noDomains - 2;
szSep = 1;
while (i > 0) {
for (j = i; j < i + szSep; j++) {
k = (i - 2 * szSep) + (j-i) * 2 + 1;
noNodes = l_fstVtxSep[k];
l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k];
l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] -
noNodes - l_sizes[k-1];
}
szSep *= 2;
i -= szSep;
}
 
#if ( PRNTlevel>=2 )
if (!iam ) {
PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes);
PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep);
}
#endif
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()");
#endif
return (-mem);
 
} /* get_perm_c_parmetis */
 
/*! \brief
*
* <pre>
* Purpose
* =======
*
* Form the structure of Pr*A +A'Pr'. A is an n-by-n matrix in
* NRformat_loc format, represented by (rowptr, colind). The output
* B=Pr*A +A'Pr' is in NRformat_loc format (symmetrically, also row
* oriented), represented by (b_rowptr, b_colind).
*
* The input matrix A is distributed in block row format on nprocs_i
* processors. The output matrix B is distributed in block row format
* on nprocs_o processors, where nprocs_o <= nprocs_i. On output, the
* matrix B has its rows permuted according to perm_r.
*
* Sketch of the algorithm
* =======================
*
* Let iam by my process number. Let fst_row, lst_row = m_loc +
* fst_row be the first/last row stored on iam.
*
* Compute Pr' - the inverse row permutation, stored in iperm_r.
*
* Compute the transpose of the block row of Pr*A that iam owns:
* T[:,Pr(fst_row:lst_row)] = Pr' * A[:,fst_row:lst_row] * Pr'
*
*
* All to all communication such that every processor iam receives all
* the blocks of the transpose matrix that it needs, that is
* T[fst_row:lst_row, :]
*
* Compute B = A[fst_row:lst_row, :] + T[fst_row:lst_row, :]
*
* If Pr != I or nprocs_i != nprocs_o then permute the rows of B (that
* is compute Pr*B) and redistribute from nprocs_i to nprocs_o
* according to the block row distribution in vtxdist_i, vtxdist_o.
* </pre>
*/
static float
a_plus_at_CompRow_loc
(
int iam, /* Input - my processor number */
int_t *perm_r, /* Input - row permutation vector Pr */
int nprocs_i, /* Input - number of processors the input matrix
is distributed on */
int_t *vtxdist_i, /* Input - index of first row on each processor of the input matrix */
int_t n, /* Input - number of columns in matrix A. */
int_t *rowptr, /* Input - row pointers of size m_loc+1 for matrix A. */
int_t *colind, /* Input - column indices of size nnz_loc for matrix A. */
int nprocs_o, /* Input - number of processors the output matrix
is distributed on */
int_t *vtxdist_o, /* Input - index of first row on each processor of the output matrix */
int_t *p_bnz, /* Output - on exit, returns the actual number of
local nonzeros in matrix A'+A. */
int_t **p_b_rowptr, /* Output - output matrix, row pointers of size m_loc+1 */
int_t **p_b_colind, /* Output - output matrix, column indices of size *p_bnz */
gridinfo_t *grid /* Input - grid of processors information */
)
{
 
int_t i, j, k, col, num_nz, nprocs;
int_t *tcolind_recv; /* temporary receive buffer */
int_t *tcolind_send; /* temporary send buffer */
int_t sz_tcolind_send, sz_tcolind_loc, sz_tcolind_recv;
int_t ind, ind_tmp, ind_rcv;
int redist_pra; /* TRUE if Pr != I or nprocs_i != nprocs_o */
int_t *marker, *iperm_r;
int_t *sendCnts, *recvCnts;
int_t *sdispls, *rdispls;
int_t bnz, *b_rowptr, *b_colind, bnz_t, *b_rowptr_t, *b_colind_t;
int_t p, t_ind, nelts, ipcol;
int_t m_loc, m_loc_o; /* number of local rows */
int_t fst_row, fst_row_o; /* index of first local row */
int_t nnz_loc; /* number of local nonzeros in matrix A */
float apat_mem, apat_mem_max;
int *intBuf1, *intBuf2, *intBuf3, *intBuf4;
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter a_plus_at_CompRow_loc()");
#endif
fst_row = vtxdist_i[iam];
m_loc = vtxdist_i[iam+1] - vtxdist_i[iam];
nnz_loc = rowptr[m_loc];
redist_pra = FALSE;
nprocs = SUPERLU_MAX(nprocs_i, nprocs_o);
apat_mem_max = 0.;
if (!(marker = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for marker[]");
if (!(iperm_r = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for iperm_r[]");
if (!(sendCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for sendCnts[]");
if (!(recvCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for recvCnts[]");
if (!(sdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for sdispls[]");
if (!(rdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for rdispls[]");
apat_mem = 2 * n + 4 * nprocs + 3;
 
#if defined (_LONGINT)
intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs * sizeof(int));
intBuf2 = intBuf1 + nprocs;
intBuf3 = intBuf1 + 2 * nprocs;
intBuf4 = intBuf1 + 3 * nprocs;
apat_mem += 4*nprocs*sizeof(int) / sizeof(int_t);
#endif
 
/* compute the inverse row permutation vector */
for (i = 0; i < n; i++) {
marker[i] = 1;
if (perm_r[i] != i)
redist_pra = TRUE;
iperm_r[perm_r[i]] = i;
}
 
/* TRANSPOSE LOCAL ROWS ON MY PROCESSOR iam. */
/* THE RESULT IS STORED IN TCOLIND_SEND. */
/* THIS COUNTS FOR TWO PASSES OF THE LOCAL MATRIX. */
 
/* First pass to get counts of each row of T, and set up column pointers */
for (j = 0; j < m_loc; j++) {
for (i = rowptr[j]; i < rowptr[j+1]; i++){
marker[iperm_r[colind[i]]]++;
}
}
/* determine number of elements to be sent to each processor */
for (p = 0; p < nprocs_i; p++) {
sendCnts[p] = 0;
for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++)
sendCnts[p] += marker[i];
}
/* exchange send/receive counts information in between all processors */
MPI_Alltoall (sendCnts, 1, mpi_int_t,
recvCnts, 1, mpi_int_t, grid->comm);
sendCnts[iam] = 0;
sz_tcolind_loc = recvCnts[iam];
for (i = 0, j = 0, p = 0; p < nprocs_i; p++) {
rdispls[p] = j;
j += recvCnts[p];
sdispls[p] = i;
i += sendCnts[p];
}
recvCnts[iam] = 0;
sz_tcolind_recv = j;
sz_tcolind_send = i;
/* allocate memory to receive necessary blocks of transpose matrix T */
if (sz_tcolind_recv) {
if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv
* sizeof(int_t) )))
ABORT("SUPERLU_MALLOC fails tcolind_recv[]");
apat_mem += sz_tcolind_recv;
}
/* allocate memory to send blocks of local transpose matrix T to other processors */
if (sz_tcolind_send) {
if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send)
* sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for tcolind_send[]");
apat_mem += sz_tcolind_send;
}
 
/* Set up marker[] to point at the beginning of each row in the
send/receive buffer. For each row, we store first its number of
elements, and then the elements. */
ind_rcv = rdispls[iam];
for (p = 0; p < nprocs_i; p++) {
for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) {
nelts = marker[i] - 1;
if (p == iam) {
tcolind_recv[ind_rcv] = nelts;
marker[i] = ind_rcv + 1;
ind_rcv += nelts + 1;
}
else {
tcolind_send[sdispls[p]] = nelts;
marker[i] = sdispls[p] + 1;
sdispls[p] += nelts + 1;
}
}
}
/* reset sdispls vector */
for (i = 0, p = 0; p < nprocs_i; p++) {
sdispls[p] = i;
i += sendCnts[p];
}
/* Second pass of the local matrix A to copy data to be send */
for (j = 0; j < m_loc; j++)
for (i = rowptr[j]; i < rowptr[j+1]; i++) {
col = colind[i];
ipcol = iperm_r[col];
if (ipcol >= fst_row && ipcol < fst_row + m_loc) /* local data */
tcolind_recv[marker[ipcol]] = perm_r[j + fst_row];
else /* remote */
tcolind_send[marker[ipcol]] = perm_r[j + fst_row];
marker[ipcol] ++;
}
sendCnts[iam] = 0;
recvCnts[iam] = 0;
 
#if defined (_LONGINT)
for (p=0; p<nprocs; p++) {
if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX ||
recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX)
ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
intBuf1[p] = (int) sendCnts[p];
intBuf2[p] = (int) sdispls[p];
intBuf3[p] = (int) recvCnts[p];
intBuf4[p] = (int) rdispls[p];
}
#else /* Default */
intBuf1 = sendCnts; intBuf2 = sdispls;
intBuf3 = recvCnts; intBuf4 = rdispls;
#endif
/* send/receive transpose matrix T */
MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t,
tcolind_recv, intBuf3, intBuf4, mpi_int_t,
grid->comm);
/* ------------------------------------------------------------
DEALLOCATE SEND COMMUNICATION STORAGE
------------------------------------------------------------*/
if (sz_tcolind_send) {
SUPERLU_FREE( tcolind_send );
apat_mem_max = apat_mem;
apat_mem -= sz_tcolind_send;
}
 
/* ----------------------------------------------------------------
FOR LOCAL ROWS:
compute B = A + T, where row j of B is:
Struct (B(j,:)) = Struct (A(j,:)) UNION Struct (T(j,:))
do not include the diagonal entry
THIS COUNTS FOR TWO PASSES OF THE LOCAL ROWS OF A AND T.
------------------------------------------------------------------ */
/* Reset marker to EMPTY */
for (i = 0; i < n; ++i) marker[i] = EMPTY;
/* save rdispls information */
for (p = 0; p < nprocs_i; p++)
sdispls[p] = rdispls[p];
 
/* First pass determines number of nonzeros in B */
num_nz = 0;
for (j = 0; j < m_loc; j++) {
/* Flag the diagonal so it's not included in the B matrix */
marker[perm_r[j + fst_row]] = j;
/* Add pattern of row A(j,:) to B(j,:) */
for (i = rowptr[j]; i < rowptr[j+1]; i++) {
k = colind[i];
if ( marker[k] != j ) {
marker[k] = j;
++num_nz;
}
}
/* Add pattern of row T(j,:) to B(j,:) */
for (p = 0; p < nprocs_i; p++) {
t_ind = rdispls[p];
nelts = tcolind_recv[t_ind]; t_ind ++;
for (i = t_ind; i < t_ind + nelts; i++) {
k = tcolind_recv[i];
if ( marker[k] != j ) {
marker[k] = j;
++num_nz;
}
}
t_ind += nelts;
rdispls[p] = t_ind;
}
}
bnz_t = num_nz;
 
/* Allocate storage for B=Pr*A+A'*Pr' */
if ( !(b_rowptr_t = (int_t*) SUPERLU_MALLOC((m_loc+1) * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for b_rowptr_t[]");
if ( bnz_t ) {
if ( !(b_colind_t = (int_t*) SUPERLU_MALLOC( bnz_t * sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for b_colind_t[]");
}
apat_mem += m_loc + 1 + bnz_t;
if (apat_mem > apat_mem_max)
apat_mem_max = apat_mem;
/* Reset marker to EMPTY */
for (i = 0; i < n; i++) marker[i] = EMPTY;
/* restore rdispls information */
for (p = 0; p < nprocs_i; p++)
rdispls[p] = sdispls[p];
/* Second pass, compute each row of B, one at a time */
num_nz = 0;
t_ind = 0;
for (j = 0; j < m_loc; j++) {
b_rowptr_t[j] = num_nz;
/* Flag the diagonal so it's not included in the B matrix */
marker[perm_r[j + fst_row]] = j;
 
/* Add pattern of row A(j,:) to B(j,:) */
for (i = rowptr[j]; i < rowptr[j+1]; i++) {
k = colind[i];
if ( marker[k] != j ) {
marker[k] = j;
b_colind_t[num_nz] = k; num_nz ++;
}
}
 
/* Add pattern of row T(j,:) to B(j,:) */
for (p = 0; p < nprocs_i; p++) {
t_ind = rdispls[p];
nelts = tcolind_recv[t_ind]; t_ind++;
for (i = t_ind; i < t_ind + nelts; i++) {
k = tcolind_recv[i];
if ( marker[k] != j ) {
marker[k] = j;
b_colind_t[num_nz] = k; num_nz++;
}
}
t_ind += nelts;
rdispls[p] = t_ind;
}
}
b_rowptr_t[m_loc] = num_nz;
 
for (p = 0; p <= SUPERLU_MIN(nprocs_i, nprocs_o); p++)
if (vtxdist_i[p] != vtxdist_o[p])
redist_pra = TRUE;
if (sz_tcolind_recv) {
SUPERLU_FREE (tcolind_recv);
apat_mem -= sz_tcolind_recv;
}
SUPERLU_FREE (marker);
SUPERLU_FREE (iperm_r);
apat_mem -= 2 * n + 1;
/* redistribute permuted matrix (by rows) from nproc_i processors
to nproc_o processors */
if (redist_pra) {
m_loc_o = vtxdist_o[iam+1] - vtxdist_o[iam];
fst_row_o = vtxdist_o[iam];
nnz_loc = 0;
if ( !(b_rowptr = intMalloc_dist(m_loc_o + 1)) )
ABORT("Malloc fails for *b_rowptr[].");
apat_mem += m_loc_o + 1;
if (apat_mem > apat_mem_max)
apat_mem_max = apat_mem;
 
for (p = 0; p < nprocs_i; p++) {
sendCnts[p] = 0;
recvCnts[p] = 0;
}
 
for (i = 0; i < m_loc; i++) {
k = perm_r[i+fst_row];
/* find the processor to which row k belongs */
j = FALSE; p = 0;
while (!j) {
if (vtxdist_o[p] <= k && k < vtxdist_o[p+1])
j = TRUE;
else
p ++;
}
if (p == iam) {
b_rowptr[k-fst_row_o] = b_rowptr_t[i + 1] - b_rowptr_t[i];
nnz_loc += b_rowptr[k-fst_row_o];
}
else
sendCnts[p] += b_rowptr_t[i + 1] - b_rowptr_t[i] + 2;
}
/* exchange send/receive counts information in between all processors */
MPI_Alltoall (sendCnts, 1, mpi_int_t,
recvCnts, 1, mpi_int_t, grid->comm);
for (i = 0, j = 0, p = 0; p < nprocs_i; p++) {
rdispls[p] = j;
j += recvCnts[p];
sdispls[p] = i;
i += sendCnts[p];
}
rdispls[p] = j;
sdispls[p] = i;
sz_tcolind_recv = j;
sz_tcolind_send = i;
 
/* allocate memory for local data */
tcolind_recv = NULL;
tcolind_send = NULL;
if (sz_tcolind_recv) {
if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv
* sizeof(int_t) )))
ABORT("SUPERLU_MALLOC fails tcolind_recv[]");
apat_mem += sz_tcolind_recv;
}
/* allocate memory to receive necessary data */
if (sz_tcolind_send) {
if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send)
* sizeof(int_t))))
ABORT("SUPERLU_MALLOC fails for tcolind_send[]");
apat_mem += sz_tcolind_send;
}
if (apat_mem > apat_mem_max)
apat_mem_max = apat_mem;
 
/* Copy data to be send */
ind_rcv = rdispls[iam];
for (i = 0; i < m_loc; i++) {
k = perm_r[i+fst_row];
/* find the processor to which row k belongs */
j = FALSE; p = 0;
while (!j) {
if (vtxdist_o[p] <= k && k < vtxdist_o[p+1])
j = TRUE;
else
p ++;
}
if (p != iam) { /* remote */
tcolind_send[sdispls[p]] = k;
tcolind_send[sdispls[p]+1] = b_rowptr_t[i+1] - b_rowptr_t[i];
sdispls[p] += 2;
for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++) {
tcolind_send[sdispls[p]] = b_colind_t[j]; sdispls[p] ++;
}
}
}
/* reset sdispls vector */
for (i = 0, p = 0; p < nprocs_i; p++) {
sdispls[p] = i;
i += sendCnts[p];
}
sendCnts[iam] = 0;
recvCnts[iam] = 0;
#if defined (_LONGINT)
for (p=0; p<nprocs; p++) {
if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX ||
recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX)
ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
intBuf1[p] = (int) sendCnts[p];
intBuf2[p] = (int) sdispls[p];
intBuf3[p] = (int) recvCnts[p];
intBuf4[p] = (int) rdispls[p];
}
#else /* Default */
intBuf1 = sendCnts; intBuf2 = sdispls;
intBuf3 = recvCnts; intBuf4 = rdispls;
#endif
 
/* send/receive permuted matrix T by rows */
MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t,
tcolind_recv, intBuf3, intBuf4, mpi_int_t,
grid->comm);
/* ------------------------------------------------------------
DEALLOCATE COMMUNICATION STORAGE
------------------------------------------------------------*/
if (sz_tcolind_send) {
SUPERLU_FREE( tcolind_send );
apat_mem -= sz_tcolind_send;
}
/* ------------------------------------------------------------
STORE ROWS IN ASCENDING ORDER OF THEIR NUMBER
------------------------------------------------------------*/
for (p = 0; p < nprocs; p++) {
if (p != iam) {
i = rdispls[p];
while (i < rdispls[p+1]) {
j = tcolind_recv[i];
nelts = tcolind_recv[i+1];
i += 2 + nelts;
b_rowptr[j-fst_row_o] = nelts;
nnz_loc += nelts;
}
}
}
 
if (nnz_loc)
if ( !(b_colind = intMalloc_dist(nnz_loc)) ) {
ABORT("Malloc fails for bcolind[].");
apat_mem += nnz_loc;
if (apat_mem > apat_mem_max)
apat_mem_max = apat_mem;
}
 
/* Initialize the array of row pointers */
k = 0;
for (j = 0; j < m_loc_o; j++) {
i = b_rowptr[j];
b_rowptr[j] = k;
k += i;
}
if (m_loc_o) b_rowptr[j] = k;
/* Copy the data into the row oriented storage */
for (p = 0; p < nprocs; p++) {
if (p != iam) {
i = rdispls[p];
while (i < rdispls[p+1]) {
j = tcolind_recv[i];
nelts = tcolind_recv[i+1];
for (i += 2, k = b_rowptr[j-fst_row_o];
k < b_rowptr[j-fst_row_o+1]; i++, k++)
b_colind[k] = tcolind_recv[i];
}
}
}
for (i = 0; i < m_loc; i++) {
k = perm_r[i+fst_row];
if (k >= vtxdist_o[iam] && k < vtxdist_o[iam+1]) {
ind = b_rowptr[k-fst_row_o];
for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++, ind++)
b_colind[ind] = b_colind_t[j];
}
}
SUPERLU_FREE(b_rowptr_t);
if ( bnz_t )
SUPERLU_FREE(b_colind_t);
if (sz_tcolind_recv)
SUPERLU_FREE(tcolind_recv);
apat_mem -= bnz_t + m_loc + sz_tcolind_recv;
*p_bnz = nnz_loc;
*p_b_rowptr = b_rowptr;
*p_b_colind = b_colind;
}
else {
*p_bnz = bnz_t;
*p_b_rowptr = b_rowptr_t;
*p_b_colind = b_colind_t;
}
SUPERLU_FREE (rdispls);
SUPERLU_FREE (sdispls);
SUPERLU_FREE (sendCnts);
SUPERLU_FREE (recvCnts);
apat_mem -= 4 * nprocs + 2;
#if defined (_LONGINT)
SUPERLU_FREE (intBuf1);
apat_mem -= 4*nprocs*sizeof(int) / sizeof(int_t);
#endif
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit a_plus_at_CompRow_loc()");
#endif
return (- apat_mem_max * sizeof(int_t));
} /* a_plus_at_CompRow_loc */
 
 
Property changes:
Added: svn:executable
+ *
/trunk/OTHER/SuperLU_DIST_2.5/SRC/pdgsequ.c
New file
0,0 → 1,235
 
 
/*! @file
* \brief Computes row and column scalings
*
* File name: pdgsequ.c
* History: Modified from LAPACK routine DGEEQU
*/
#include <math.h>
#include "superlu_ddefs.h"
 
/*! \brief
 
<pre>
Purpose
=======
 
PDGSEQU computes row and column scalings intended to equilibrate an
M-by-N sparse matrix A and reduce its condition number. R returns the row
scale factors and C the column scale factors, chosen to try to make
the largest element in each row and column of the matrix B with
elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
 
R(i) and C(j) are restricted to be between SMLNUM = smallest safe
number and BIGNUM = largest safe number. Use of these scaling
factors is not guaranteed to reduce the condition number of A but
works well in practice.
 
See supermatrix.h for the definition of 'SuperMatrix' structure.
Arguments
=========
 
A (input) SuperMatrix*
The matrix of dimension (A->nrow, A->ncol) whose equilibration
factors are to be computed. The type of A can be:
Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
R (output) double*, size A->nrow
If INFO = 0 or INFO > M, R contains the row scale factors
for A.
C (output) double*, size A->ncol
If INFO = 0, C contains the column scale factors for A.
ROWCND (output) double*
If INFO = 0 or INFO > M, ROWCND contains the ratio of the
smallest R(i) to the largest R(i). If ROWCND >= 0.1 and
AMAX is neither too large nor too small, it is not worth
scaling by R.
COLCND (output) double*
If INFO = 0, COLCND contains the ratio of the smallest
C(i) to the largest C(i). If COLCND >= 0.1, it is not
worth scaling by C.
AMAX (output) double*
Absolute value of largest matrix element. If AMAX is very
close to overflow or very close to underflow, the matrix
should be scaled.
INFO (output) int*
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
> 0: if INFO = i, and i is
<= M: the i-th row of A is exactly zero
> M: the (i-M)-th column of A is exactly zero
 
GRID (input) gridinof_t*
The 2D process mesh.
=====================================================================
</pre>
*/
 
void
pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
double *colcnd, double *amax, int_t *info, gridinfo_t *grid)
{
 
/* Local variables */
NRformat_loc *Astore;
double *Aval;
int i, j, irow, jcol, m_loc;
double rcmin, rcmax;
double bignum, smlnum;
extern double dlamch_(char *);
double tempmax, tempmin;
double *loc_max;
int *r_sizes, *displs;
double *loc_r;
int_t procs;
/* Test the input parameters. */
*info = 0;
if ( A->nrow < 0 || A->ncol < 0 ||
A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE )
*info = -1;
if (*info != 0) {
i = -(*info);
xerbla_("pdgsequ", &i);
return;
}
 
/* Quick return if possible */
if ( A->nrow == 0 || A->ncol == 0 ) {
*rowcnd = 1.;
*colcnd = 1.;
*amax = 0.;
return;
}
 
Astore = A->Store;
Aval = Astore->nzval;
m_loc = Astore->m_loc;
/* Get machine constants. */
smlnum = dlamch_("S");
bignum = 1. / smlnum;
 
/* Compute row scale factors. */
for (i = 0; i < A->nrow; ++i) r[i] = 0.;
 
/* Find the maximum element in each row. */
irow = Astore->fst_row;
for (i = 0; i < m_loc; ++i) {
for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j)
r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) );
++irow;
}
 
/* Find the maximum and minimum scale factors. */
rcmin = bignum;
rcmax = 0.;
for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) {
rcmax = SUPERLU_MAX(rcmax, r[i]);
rcmin = SUPERLU_MIN(rcmin, r[i]);
}
/* Get the global MAX and MIN for R */
tempmax = rcmax;
tempmin = rcmin;
MPI_Allreduce( &tempmax, &rcmax,
1, MPI_DOUBLE, MPI_MAX, grid->comm);
MPI_Allreduce( &tempmin, &rcmin,
1, MPI_DOUBLE, MPI_MIN, grid->comm);
 
*amax = rcmax;
 
if (rcmin == 0.) {
/* Find the first zero scale factor and return an error code. */
for (i = 0; i < A->nrow; ++i)
if (r[i] == 0.) {
*info = i + 1;
return;
}
} else {
/* Invert the scale factors. */
for (i = 0; i < A->nrow; ++i)
r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
/* Compute ROWCND = min(R(I)) / max(R(I)) */
*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
}
 
/* Compute column scale factors */
for (j = 0; j < A->ncol; ++j) c[j] = 0.;
 
/* Find the maximum element in each column, assuming the row
scalings computed above. */
irow = Astore->fst_row;
for (i = 0; i < m_loc; ++i) {
for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
jcol = Astore->colind[j];
c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] );
}
++irow;
}
 
/* Find the global maximum for c[j] */
if ( !(loc_max = doubleMalloc_dist(A->ncol)))
ABORT("Malloc fails for loc_max[].");
for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j];
MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm);
SUPERLU_FREE(loc_max);
 
/* Find the maximum and minimum scale factors. */
rcmin = bignum;
rcmax = 0.;
for (j = 0; j < A->ncol; ++j) {
rcmax = SUPERLU_MAX(rcmax, c[j]);
rcmin = SUPERLU_MIN(rcmin, c[j]);
}
 
if (rcmin == 0.) {
/* Find the first zero scale factor and return an error code. */
for (j = 0; j < A->ncol; ++j)
if ( c[j] == 0. ) {
*info = A->nrow + j + 1;
return;
}
} else {
/* Invert the scale factors. */
for (j = 0; j < A->ncol; ++j)
c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
/* Compute COLCND = min(C(J)) / max(C(J)) */
*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
}
 
/* gather R from each process to get the global R. */
 
procs = grid->nprow * grid->npcol;
if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int))))
ABORT("Malloc fails for r_sizes[].");
displs = r_sizes + procs;
if ( !(loc_r = doubleMalloc_dist(m_loc)))
ABORT("Malloc fails for loc_r[].");
j = Astore->fst_row;
for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++];
 
/* First gather the size of each piece. */
MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm);
/* Set up the displacements for allgatherv */
displs[0] = 0;
for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1];
 
/* Now gather the actual data */
MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs,
MPI_DOUBLE, grid->comm);
SUPERLU_FREE(r_sizes);
SUPERLU_FREE(loc_r);
 
return;
 
} /* pdgsequ */
/trunk/OTHER/SuperLU_DIST_2.5/SRC/pdsymbfact_distdata.c
New file
0,0 → 1,1961
 
 
/*! @file
* \brief Redistribute the symbolic structure of L and U from the distribution
*
* <pre>
* -- Parallel symbolic factorization auxialiary routine (version 2.3) --
* -- Distributes the data from parallel symbolic factorization
* -- to numeric factorization
* INRIA France - July 1, 2004
* Laura Grigori
*
* November 1, 2007
* Feburary 20, 2008
* October 15, 2008
* </pre>
*/
 
/* limits.h: the largest positive integer (INT_MAX) */
#include <limits.h>
 
#include "superlu_ddefs.h"
#include "psymbfact.h"
 
/*! \brief
*
* <pre>
* Purpose
* =======
*
* Redistribute the symbolic structure of L and U from the distribution
* used in the parallel symbolic factorization step to the distdibution
* used in the parallel numeric factorization step. On exit, the L and U
* structure for the 2D distribution used in the numeric factorization step is
* stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal
* information is also computed and it is stored in Glu_persist->supno
* and Glu_persist->xsup.
*
* This routine allocates memory for storing the structure of L and U
* and the supernodes information. This represents the arrays:
* p_xlsub, p_lsub, p_xusub, p_usub,
* Glu_persist->supno, Glu_persist->xsup.
*
* This routine also deallocates memory allocated during symbolic
* factorization routine. That is, the folloing arrays are freed:
* Pslu_freeable->xlsub, Pslu_freeable->lsub,
* Pslu_freeable->xusub, Pslu_freeable->usub,
* Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
* Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
*
* Arguments
* =========
*
* n (Input) int_t
* Order of the input matrix
* Pslu_freeable (Input) Pslu_freeable_t *
* Local L and U structure,
* global to local indexing information.
*
* Glu_persist (Output) Glu_persist_t *
* Stores on output the information on supernodes mapping.
*
* p_xlsub (Output) int_t **
* Pointer to structure of L distributed on a 2D grid
* of processors, stored by columns.
*
* p_lsub (Output) int_t **
* Structure of L distributed on a 2D grid of processors,
* stored by columns.
*
* p_xusub (Output) int_t **
* Pointer to structure of U distributed on a 2D grid
* of processors, stored by rows.
*
* p_usub (Output) int_t **
* Structure of U distributed on a 2D grid of processors,
* stored by rows.
*
* grid (Input) gridinfo_t*
* The 2D process mesh.
*
* Return value
* ============
* < 0, number of bytes allocated on return from the dist_symbLU.
* > 0, number of bytes allocated in this routine when out of memory.
* (an approximation).
* </pre>
*/
 
static float
dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable,
Glu_persist_t *Glu_persist,
int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
gridinfo_t *grid
)
{
int iam, nprocs, pc, pr, p, np, p_diag;
int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u,
*tmp_ptrToSend, *mem;
int_t *nnzToRecv_l, *nnzToRecv_u;
int_t *send_1, *send_2, nsend_1, nsend_2;
int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
int_t nsupers, nsupers_i, nsupers_j;
int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
int_t maxszsn, maxNvtcsPProc;
int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
int_t *globToLoc, nvtcs_loc;
int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc,
RecvCnt_l, RecvCnt_u, ind_loc;
int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
int_t nelts, isize;
float memAux; /* Memory used during this routine and freed on return */
float memRet; /* Memory allocated and not freed on return */
int_t iword, dword;
/* ------------------------------------------------------------
INITIALIZATION.
------------------------------------------------------------*/
iam = grid->iam;
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter dist_symbLU()");
#endif
nprocs = (int) grid->nprow * grid->npcol;
xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub;
xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub;
maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
globToLoc = Pslu_freeable->globToLoc;
nvtcs_loc = Pslu_freeable->nvtcs_loc;
xsup_beg_s = Pslu_freeable->xsup_beg_loc;
xsup_end_s = Pslu_freeable->xsup_end_loc;
supno_s = Pslu_freeable->supno_loc;
rcv_luind = NULL;
iword = sizeof(int_t);
dword = sizeof(double);
memAux = 0.; memRet = 0.;
mem = intCalloc_dist(12 * nprocs);
if (!mem)
return (ERROR_RET);
memAux = (float) (12 * nprocs * sizeof(int_t));
nnzToRecv = mem;
nnzToSend = nnzToRecv + 2*nprocs;
nnzToSend_l = nnzToSend + 2 * nprocs;
nnzToSend_u = nnzToSend_l + nprocs;
send_1 = nnzToSend_u + nprocs;
send_2 = send_1 + nprocs;
tmp_ptrToSend = send_2 + nprocs;
nnzToRecv_l = tmp_ptrToSend + nprocs;
nnzToRecv_u = nnzToRecv_l + nprocs;
ptrToSend = nnzToSend;
ptrToRecv = nnzToSend + nprocs;
 
nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int));
intBuf1 = nvtcs + nprocs;
intBuf2 = nvtcs + 2 * nprocs;
intBuf3 = nvtcs + 3 * nprocs;
intBuf4 = nvtcs + 4 * nprocs;
memAux += 5 * nprocs * sizeof(int);
 
maxszsn = sp_ienv_dist(3);
/* Allocate space for storing Glu_persist_n. */
if ( !(supno_n = intMalloc_dist(n+1)) ) {
fprintf (stderr, "Malloc fails for supno_n[].");
return (memAux);
}
memRet += (float) ((n+1) * sizeof(int_t));
 
/* ------------------------------------------------------------
DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION
------------------------------------------------------------*/
if (nvtcs_loc > INT_MAX)
ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n");
intNvtcs_loc = (int) nvtcs_loc;
MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT,
0, grid->comm);
 
if (!iam) {
/* set ptrToRecv to point to the beginning of the data for
each processor */
for (k = 0, p = 0; p < nprocs; p++) {
ptrToRecv[p] = k;
k += nvtcs[p];
}
}
if (nprocs > 1) {
temp = NULL;
if (!iam ) {
if ( !(temp = intMalloc_dist (n+1)) ) {
fprintf (stderr, "Malloc fails for temp[].");
return (memAux + memRet);
}
memAux += (float) (n+1) * iword;
}
#if defined (_LONGINT)
for (p=0; p<nprocs; p++) {
if (ptrToRecv[p] > INT_MAX)
ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
intBuf1[p] = (int) ptrToRecv[p];
}
#else /* Default */
intBuf1 = ptrToRecv;
#endif
MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t,
temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm);
}
else
temp = supno_s;
 
if (!iam) {
nsupers = 0;
p = (int) OWNER( globToLoc[0] );
gb = temp[ptrToRecv[p]];
supno_n[0] = nsupers;
ptrToRecv[p] ++;
szsn = 1;
for (j = 1; j < n; j ++) {
if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) {
nsupers ++;
p = (int) OWNER( globToLoc[j] );
gb = temp[ptrToRecv[p]];
szsn = 1;
}
else {
szsn ++;
}
ptrToRecv[p] ++;
supno_n[j] = nsupers;
}
nsupers++;
if (nprocs > 1) {
SUPERLU_FREE (temp);
memAux -= (float) (n+1) * iword;
}
supno_n[n] = nsupers;
}
 
/* reset to 0 nnzToSend */
for (p = 0; p < 2 *nprocs; p++)
nnzToSend[p] = 0;
MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm);
nsupers = supno_n[n];
/* Allocate space for storing Glu_persist_n. */
if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) {
fprintf (stderr, "Malloc fails for xsup_n[].");
return (memAux + memRet);
}
memRet += (float) (nsupers+1) * iword;
 
/* ------------------------------------------------------------
COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
THEN ALLOCATE SPACE.
THIS ACCOUNTS FOR THE FIRST PASS OF L and U.
------------------------------------------------------------*/
gb = EMPTY;
for (i = 0; i < n; i++) {
if (gb != supno_n[i]) {
/* a new supernode starts */
gb = supno_n[i];
xsup_n[gb] = i;
}
}
xsup_n[nsupers] = n;
for (p = 0; p < nprocs; p++) {
send_1[p] = FALSE;
send_2[p] = FALSE;
}
for (gb_n = 0; gb_n < nsupers; gb_n ++) {
i = xsup_n[gb_n];
if (iam == (int) OWNER( globToLoc[i] )) {
pc = PCOL( gb_n, grid );
pr = PROW( gb_n, grid );
p_diag = PNUM( pr, pc, grid);
i_loc = LOCAL_IND( globToLoc[i] );
gb_s = supno_s[i_loc];
fst_s = xsup_beg_s[gb_s];
lst_s = xsup_end_s[gb_s];
fst_s_l = LOCAL_IND( globToLoc[fst_s] );
for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) {
k = lsub_s[j];
if (k >= i) {
gb = supno_n[k];
p = (int) PNUM( PROW(gb, grid), pc, grid );
nnzToSend[2*p] ++;
send_1[p] = TRUE;
}
}
for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) {
k = usub_s[j];
if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) {
gb = supno_n[k];
p = PNUM( pr, PCOL(gb, grid), grid);
nnzToSend[2*p+1] ++;
send_2[p] = TRUE;
}
}
nsend_2 = 0;
for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
nnzToSend[2*p+1] += 2;
if (send_2[p]) nsend_2 ++;
}
for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++)
if (send_2[p] || p == p_diag) {
if (p == p_diag && !send_2[p])
nnzToSend[2*p+1] += nsend_2;
else
nnzToSend[2*p+1] += nsend_2-1;
send_2[p] = FALSE;
}
nsend_1 = 0;
for (p = pc; p < nprocs; p += grid->npcol) {
nnzToSend[2*p] += 2;
if (send_1[p]) nsend_1 ++;
}
for (p = pc; p < nprocs; p += grid->npcol)
if (send_1[p]) {
nnzToSend[2*p] += nsend_1-1;
send_1[p] = FALSE;
}
else
nnzToSend[2*p] += nsend_1;
}
}
/* All-to-all communication */
MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t,
grid->comm);
nnz_loc_l = nnz_loc_u = 0;
SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0;
for (p = 0; p < nprocs; p++) {
if ( p != iam ) {
SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p];
SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1];
RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p];
RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1];
} else {
nnz_loc_l += nnzToRecv[2*p];
nnz_loc_u += nnzToRecv[2*p+1];
nnzToSend_l[p] = 0; nnzToSend_u[p] = 0;
nnzToRecv_l[p] = nnzToRecv[2*p];
nnzToRecv_u[p] = nnzToRecv[2*p+1];
}
}
/* Allocate space for storing the symbolic structure after redistribution. */
nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) {
fprintf (stderr, "Malloc fails for xlsub_n[].");
return (memAux + memRet);
}
memRet += (float) (nsupers_j+1) * iword;
 
if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) {
fprintf (stderr, "Malloc fails for xusub_n[].");
return (memAux + memRet);
}
memRet += (float) (nsupers_i+1) * iword;
 
/* Allocate temp storage for sending/receiving the L/U symbolic structure. */
if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) {
if (!(rcv_luind =
intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) {
fprintf (stderr, "Malloc fails for rcv_luind[].");
return (memAux + memRet);
}
memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u)
* iword;
}
if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) {
fprintf (stderr, "Malloc fails for index[].");
return (memAux + memRet);
}
memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
}
/* ------------------------------------------------------------------
LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND.
THIS ACCOUNTS FOR THE SECOND PASS OF L and U.
------------------------------------------------------------------*/
sendL = TRUE;
sendU = FALSE;
while (sendL || sendU) {
if (sendL) {
xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n;
nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l;
}
if (sendU) {
xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n;
nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u;
}
for (i = 0, j = 0, p = 0; p < nprocs; p++) {
if ( p != iam ) {
ptrToSend[p] = i; i += nnzToSend[p];
}
ptrToRecv[p] = j; j += nnzToRecv[p];
}
nnzToRecv[iam] = 0;
ind_loc = ptrToRecv[iam];
for (gb_n = 0; gb_n < nsupers; gb_n++) {
nsend_2 = 0;
i = xsup_n[gb_n];
if (iam == OWNER( globToLoc[i] )) {
pc = PCOL( gb_n, grid );
pr = PROW( gb_n, grid );
p_diag = PNUM( pr, pc, grid );
i_loc = LOCAL_IND( globToLoc[i] );
gb_s = supno_s[i_loc];
fst_s = xsup_beg_s[gb_s];
lst_s = xsup_end_s[gb_s];
fst_s_l = LOCAL_IND( globToLoc[fst_s] );
 
if (sendL) {
p = pc; np = grid->nprow;
} else {
p = pr * grid->npcol; np = grid->npcol;
}
for (j = 0; j < np; j++) {
if (p == iam) {
rcv_luind[ind_loc] = gb_n;
rcv_luind[ind_loc+1] = 0;
tmp_ptrToSend[p] = ind_loc + 1;
ind_loc += 2;
}
else {
snd_luind[ptrToSend[p]] = gb_n;
snd_luind[ptrToSend[p]+1] = 0;
tmp_ptrToSend[p] = ptrToSend[p] + 1;
ptrToSend[p] += 2;
}
if (sendL) p += grid->npcol;
if (sendU) p++;
}
for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) {
k = sub_s[j];
if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) {
gb = supno_n[k];
if (sendL)
p = PNUM( PROW(gb, grid), pc, grid );
else
p = PNUM( pr, PCOL(gb, grid), grid);
if (send_1[p] == FALSE) {
send_1[p] = TRUE;
send_2[nsend_2] = k; nsend_2 ++;
}
if (p == iam) {
rcv_luind[ind_loc] = k; ind_loc++;
if (sendL)
xsub_n[LBj( gb_n, grid )] ++;
else
xsub_n[LBi( gb_n, grid )] ++;
}
else {
snd_luind[ptrToSend[p]] = k;
ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
}
}
}
if (sendL)
for (p = pc; p < nprocs; p += grid->npcol) {
for (k = 0; k < nsend_2; k++) {
gb = supno_n[send_2[k]];
if (PNUM(PROW(gb, grid), pc, grid) != p) {
if (p == iam) {
rcv_luind[ind_loc] = send_2[k]; ind_loc++;
xsub_n[LBj( gb_n, grid )] ++;
}
else {
snd_luind[ptrToSend[p]] = send_2[k];
ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
}
}
}
send_1[p] = FALSE;
}
if (sendU)
for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) {
if (send_1[p] || p == p_diag) {
for (k = 0; k < nsend_2; k++) {
gb = supno_n[send_2[k]];
if(PNUM( pr, PCOL(gb, grid), grid) != p) {
if (p == iam) {
rcv_luind[ind_loc] = send_2[k]; ind_loc++;
xsub_n[LBi( gb_n, grid )] ++;
}
else {
snd_luind[ptrToSend[p]] = send_2[k];
ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++;
}
}
}
send_1[p] = FALSE;
}
}
}
}
/* reset ptrToSnd to point to the beginning of the data for
each processor (structure needed in MPI_Alltoallv) */
for (i = 0, p = 0; p < nprocs; p++) {
ptrToSend[p] = i; i += nnzToSend[p];
}
 
/* ------------------------------------------------------------
PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
Note: it uses MPI_Alltoallv.
------------------------------------------------------------*/
if (nprocs > 1) {
#if defined (_LONGINT)
nnzToSend[iam] = 0;
for (p=0; p<nprocs; p++) {
if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX ||
nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX)
ABORT("ERROR in dist_symbLU size to send > INT_MAX\n");
intBuf1[p] = (int) nnzToSend[p];
intBuf2[p] = (int) ptrToSend[p];
intBuf3[p] = (int) nnzToRecv[p];
intBuf4[p] = (int) ptrToRecv[p];
}
#else /* Default */
intBuf1 = nnzToSend; intBuf2 = ptrToSend;
intBuf3 = nnzToRecv; intBuf4 = ptrToRecv;
#endif
 
MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t,
rcv_luind, intBuf3, intBuf4, mpi_int_t,
grid->comm);
}
if (sendL)
nnzToRecv[iam] = nnz_loc_l;
else
nnzToRecv[iam] = nnz_loc_u;
/* ------------------------------------------------------------
DEALLOCATE TEMPORARY STORAGE.
-------------------------------------------------------------*/
if (sendU)
if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) {
SUPERLU_FREE (snd_luind);
memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword;
}
/* ------------------------------------------------------------
CONVERT THE FORMAT.
------------------------------------------------------------*/
/* Initialize the array of column of L/ row of U pointers */
k = 0;
for (p = 0; p < nprocs; p ++) {
if (p != iam) {
i = k;
while (i < k + nnzToRecv[p]) {
gb = rcv_luind[i];
nelts = rcv_luind[i+1];
if (sendL)
xsub_n[LBj( gb, grid )] = nelts;
else
xsub_n[LBi( gb, grid )] = nelts;
i += nelts + 2;
}
}
k += nnzToRecv[p];
}
 
if (sendL) j = nsupers_j;
else j = nsupers_i;
k = 0;
isize = xsub_n[0];
xsub_n[0] = 0;
for (gb_l = 1; gb_l < j; gb_l++) {
k += isize;
isize = xsub_n[gb_l];
xsub_n[gb_l] = k;
}
xsub_n[gb_l] = k + isize;
nnz_loc = xsub_n[gb_l];
if (sendL) {
lsub_n = NULL;
if (nnz_loc) {
if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) {
fprintf (stderr, "Malloc fails for lsub_n[].");
return (memAux + memRet);
}
memRet += (float) (nnz_loc * iword);
}
sub_n = lsub_n;
}
if (sendU) {
usub_n = NULL;
if (nnz_loc) {
if ( !(usub_n = intMalloc_dist(nnz_loc)) ) {
fprintf (stderr, "Malloc fails for usub_n[].");
return (memAux + memRet);
}
memRet += (float) (nnz_loc * iword);
}
sub_n = usub_n;
}
/* Copy the data into the L column / U row oriented storage */
k = 0;
for (p = 0; p < nprocs; p++) {
i = k;
while (i < k + nnzToRecv[p]) {
gb = rcv_luind[i];
if (gb >= nsupers)
printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n",
iam, p, gb, nsupers, i, i-k);
i += 2;
if (sendL) gb_l = LBj( gb, grid );
if (sendU) gb_l = LBi( gb, grid );
for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) {
sub_n[j] = rcv_luind[i];
}
}
k += nnzToRecv[p];
}
if (sendL) {
sendL = FALSE; sendU = TRUE;
}
else
sendU = FALSE;
}
 
/* deallocate memory allocated during symbolic factorization routine */
if (rcv_luind != NULL) {
SUPERLU_FREE (rcv_luind);
memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword;
}
SUPERLU_FREE (mem);
memAux -= (float) (12 * nprocs * iword);
SUPERLU_FREE(nvtcs);
memAux -= (float) (5 * nprocs * sizeof(int));
if (xlsub_s != NULL) {
SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s);
}
if (xusub_s != NULL) {
SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s);
}
SUPERLU_FREE (globToLoc);
if (supno_s != NULL) {
SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s);
SUPERLU_FREE (supno_s);
}
Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n;
*p_xlsub = xlsub_n; *p_lsub = lsub_n;
*p_xusub = xusub_n; *p_usub = usub_n;
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit dist_symbLU()");
#endif
return (-memRet);
}
/*! \brief
*
* <pre>
* Purpose
* =======
* Re-distribute A on the 2D process mesh. The lower part is
* stored using a column format and the upper part
* is stored using a row format.
*
* Arguments
* =========
*
* A (Input) SuperMatrix*
* The distributed input matrix A of dimension (A->nrow, A->ncol).
* The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
*
* ScalePermstruct (Input) ScalePermstruct_t*
* The data structure to store the scaling and permutation vectors
* describing the transformations performed to the original matrix A.
*
* Glu_persist (Input) Glu_persist_t *
* Information on supernodes mapping.
*
* grid (Input) gridinfo_t*
* The 2D process mesh.
*
* p_ainf_colptr (Output) int_t**
* Pointer to the lower part of A distributed on a 2D grid
* of processors, stored by columns.
*
* p_ainf_rowind (Output) int_t**
* Structure of of the lower part of A distributed on a
* 2D grid of processors, stored by columns.
*
* p_ainf_val (Output) double**
* Numerical values of the lower part of A, distributed on a
* 2D grid of processors, stored by columns.
*
* p_asup_rowptr (Output) int_t**
* Pointer to the upper part of A distributed on a 2D grid
* of processors, stored by rows.
*
* p_asup_colind (Output) int_t**
* Structure of of the upper part of A distributed on a
* 2D grid of processors, stored by rows.
*
* p_asup_val (Output) double**
* Numerical values of the upper part of A, distributed on a
* 2D grid of processors, stored by rows.
*
* ilsum_i (Input) int_t *
* Starting position of each supernode in
* the full array (local, block row wise).
*
* ilsum_j (Input) int_t *
* Starting position of each supernode in
* the full array (local, block column wise).
*
* Return value
* ============
* < 0, number of bytes allocated on return from the dist_symbLU
* > 0, number of bytes allocated when out of memory.
* (an approximation).
* </pre>
*/
static float
ddist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct,
Glu_persist_t *Glu_persist, gridinfo_t *grid,
int_t **p_ainf_colptr, int_t **p_ainf_rowind, double **p_ainf_val,
int_t **p_asup_rowptr, int_t **p_asup_colind, double **p_asup_val,
int_t *ilsum_i, int_t *ilsum_j
)
{
int iam, p, procs;
NRformat_loc *Astore;
int_t *perm_r; /* row permutation vector */
int_t *perm_c; /* column permutation vector */
int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize;
int_t nsupers, nsupers_i, nsupers_j;
int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */
int_t nnz_remote; /* number of remote nonzeros to be sent */
int_t SendCnt; /* number of remote nonzeros to be sent */
int_t RecvCnt; /* number of remote nonzeros to be sent */
int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
double *asup_val, *ainf_val;
int_t *nnzToSend, *nnzToRecv, maxnnzToRecv;
int_t *ia, *ja, **ia_send, *index, *itemp;
int_t *ptr_to_send;
double *aij, **aij_send, *nzval, *dtemp;
double *nzval_a;
MPI_Request *send_req;
MPI_Status status;
int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */
int_t *supno = Glu_persist->supno;
float memAux; /* Memory used during this routine and freed on return */
float memRet; /* Memory allocated and not freed on return */
int_t iword, dword, szbuf;
 
/* ------------------------------------------------------------
INITIALIZATION.
------------------------------------------------------------*/
iam = grid->iam;
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter ddist_A()");
#endif
iword = sizeof(int_t);
dword = sizeof(double);
perm_r = ScalePermstruct->perm_r;
perm_c = ScalePermstruct->perm_c;
procs = grid->nprow * grid->npcol;
Astore = (NRformat_loc *) A->Store;
n = A->ncol;
m_loc = Astore->m_loc;
fst_row = Astore->fst_row;
if (!(nnzToRecv = intCalloc_dist(2*procs))) {
fprintf (stderr, "Malloc fails for nnzToRecv[].");
return (ERROR_RET);
}
memAux = (float) (2 * procs * iword);
memRet = 0.;
nnzToSend = nnzToRecv + procs;
nsupers = supno[n-1] + 1;
 
/* ------------------------------------------------------------
COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
THEN ALLOCATE SPACE.
THIS ACCOUNTS FOR THE FIRST PASS OF A.
------------------------------------------------------------*/
for (i = 0; i < m_loc; ++i) {
for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */
jcol = Astore->colind[j];
gbi = BlockNum( irow );
gbj = BlockNum( jcol );
p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
++nnzToSend[p];
}
}
/* All-to-all communication */
MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
grid->comm);
maxnnzToRecv = 0;
nnz_loc = SendCnt = RecvCnt = 0;
for (p = 0; p < procs; ++p) {
if ( p != iam ) {
SendCnt += nnzToSend[p];
RecvCnt += nnzToRecv[p];
maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
} else {
nnz_loc += nnzToRecv[p];
/*assert(nnzToSend[p] == nnzToRecv[p]);*/
}
}
k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
szbuf = k;
 
/* Allocate space for storing the triplets after redistribution. */
if ( !(ia = intMalloc_dist(2*k)) ) {
fprintf (stderr, "Malloc fails for ia[].");
return (memAux);
}
memAux += (float) (2*k*iword);
ja = ia + k;
if ( !(aij = doubleMalloc_dist(k)) ) {
fprintf (stderr, "Malloc fails for aij[].");
return (memAux);
}
memAux += (float) (k*dword);
/* Allocate temporary storage for sending/receiving the A triplets. */
if ( procs > 1 ) {
if ( !(send_req = (MPI_Request *)
SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) {
fprintf (stderr, "Malloc fails for send_req[].");
return (memAux);
}
memAux += (float) (2*procs *sizeof(MPI_Request));
if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for ia_send[].");
return (memAux);
}
memAux += (float) (procs*sizeof(int_t*));
if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) {
fprintf(stderr, "Malloc fails for aij_send[].");
return (memAux);
}
memAux += (float) (procs*sizeof(double*));
if ( !(index = intMalloc_dist(2*SendCnt)) ) {
fprintf(stderr, "Malloc fails for index[].");
return (memAux);
}
memAux += (float) (2*SendCnt*iword);
if ( !(nzval = doubleMalloc_dist(SendCnt)) ) {
fprintf(stderr, "Malloc fails for nzval[].");
return (memAux);
}
memAux += (float) (SendCnt * dword);
if ( !(ptr_to_send = intCalloc_dist(procs)) ) {
fprintf(stderr, "Malloc fails for ptr_to_send[].");
return (memAux);
}
memAux += (float) (procs * iword);
if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) {
fprintf(stderr, "Malloc fails for itemp[].");
return (memAux);
}
memAux += (float) (2*maxnnzToRecv*iword);
if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) {
fprintf(stderr, "Malloc fails for dtemp[].");
return (memAux);
}
memAux += (float) (maxnnzToRecv * dword);
for (i = 0, j = 0, p = 0; p < procs; ++p) {
if ( p != iam ) {
ia_send[p] = &index[i];
i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
aij_send[p] = &nzval[j];
j += nnzToSend[p];
}
}
} /* if procs > 1 */
nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) {
fprintf (stderr, "Malloc fails for *ainf_colptr[].");
return (memAux);
}
memRet += (float) (ilsum_j[nsupers_j] + 1) * iword;
if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) {
fprintf (stderr, "Malloc fails for *asup_rowptr[].");
return (memAux+memRet);
}
memRet += (float) (ilsum_i[nsupers_i] + 1) * iword;
/* ------------------------------------------------------------
LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
THIS ACCOUNTS FOR THE SECOND PASS OF A.
------------------------------------------------------------*/
nnz_loc = 0; /* Reset the local nonzero count. */
nnz_loc_ainf = nnz_loc_asup = 0;
nzval_a = Astore->nzval;
for (i = 0; i < m_loc; ++i) {
for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */
jcol = Astore->colind[j];
gbi = BlockNum( irow );
gbj = BlockNum( jcol );
p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
if ( p != iam ) { /* remote */
k = ptr_to_send[p];
ia_send[p][k] = irow;
ia_send[p][k + nnzToSend[p]] = jcol;
aij_send[p][k] = nzval_a[j];
++ptr_to_send[p];
} else { /* local */
ia[nnz_loc] = irow;
ja[nnz_loc] = jcol;
aij[nnz_loc] = nzval_a[j];
++nnz_loc;
/* Count nonzeros in each column of L / row of U */
if (gbi >= gbj) {
ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
nnz_loc_ainf ++;
}
else {
asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
nnz_loc_asup ++;
}
}
}
}
 
/* ------------------------------------------------------------
PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
NOTE: Can possibly use MPI_Alltoallv.
------------------------------------------------------------*/
for (p = 0; p < procs; ++p) {
if ( p != iam ) {
it = 2*nnzToSend[p];
MPI_Isend( ia_send[p], it, mpi_int_t,
p, iam, grid->comm, &send_req[p] );
it = nnzToSend[p];
MPI_Isend( aij_send[p], it, MPI_DOUBLE,
p, iam+procs, grid->comm, &send_req[procs+p] );
}
}
for (p = 0; p < procs; ++p) {
if ( p != iam ) {
it = 2*nnzToRecv[p];
MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
it = nnzToRecv[p];
MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs,
grid->comm, &status );
for (i = 0; i < nnzToRecv[p]; ++i) {
ia[nnz_loc] = itemp[i];
irow = itemp[i];
jcol = itemp[i + nnzToRecv[p]];
/* assert(jcol<n); */
ja[nnz_loc] = jcol;
aij[nnz_loc] = dtemp[i];
++nnz_loc;
gbi = BlockNum( irow );
gbj = BlockNum( jcol );
/* Count nonzeros in each column of L / row of U */
if (gbi >= gbj) {
ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++;
nnz_loc_ainf ++;
}
else {
asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++;
nnz_loc_asup ++;
}
}
}
}
for (p = 0; p < procs; ++p) {
if ( p != iam ) {
MPI_Wait( &send_req[p], &status);
MPI_Wait( &send_req[procs+p], &status);
}
}
/* ------------------------------------------------------------
DEALLOCATE TEMPORARY STORAGE
------------------------------------------------------------*/
SUPERLU_FREE(nnzToRecv);
memAux -= 2 * procs * iword;
if ( procs > 1 ) {
SUPERLU_FREE(send_req);
SUPERLU_FREE(ia_send);
SUPERLU_FREE(aij_send);
SUPERLU_FREE(index);
SUPERLU_FREE(nzval);
SUPERLU_FREE(ptr_to_send);
SUPERLU_FREE(itemp);
SUPERLU_FREE(dtemp);
memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) +
procs*sizeof(double*) + 2*SendCnt * iword +
SendCnt* dword + procs*iword +
2*maxnnzToRecv*iword + maxnnzToRecv*dword;
}
/* ------------------------------------------------------------
CONVERT THE TRIPLET FORMAT.
------------------------------------------------------------*/
if (nnz_loc_ainf != 0) {
if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) {
fprintf (stderr, "Malloc fails for *ainf_rowind[].");
return (memAux+memRet);
}
memRet += (float) (nnz_loc_ainf * iword);
if ( !(ainf_val = doubleMalloc_dist(nnz_loc_ainf)) ) {
fprintf (stderr, "Malloc fails for *ainf_val[].");
return (memAux+memRet);
}
memRet += (float) (nnz_loc_ainf * dword);
}
else {
ainf_rowind = NULL;
ainf_val = NULL;
}
if (nnz_loc_asup != 0) {
if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) {
fprintf (stderr, "Malloc fails for *asup_colind[].");
return (memAux + memRet);
}
memRet += (float) (nnz_loc_asup * iword);
if ( !(asup_val = doubleMalloc_dist(nnz_loc_asup)) ) {
fprintf (stderr, "Malloc fails for *asup_val[].");
return (memAux + memRet);
}
memRet += (float) (nnz_loc_asup * dword);
}
else {
asup_colind = NULL;
asup_val = NULL;
}
 
/* Initialize the array of column pointers */
k = 0;
jsize = ainf_colptr[0]; ainf_colptr[0] = 0;
for (j = 1; j < ilsum_j[nsupers_j]; j++) {
k += jsize;
jsize = ainf_colptr[j];
ainf_colptr[j] = k;
}
ainf_colptr[ilsum_j[nsupers_j]] = k + jsize;
i = 0;
isize = asup_rowptr[0]; asup_rowptr[0] = 0;
for (j = 1; j < ilsum_i[nsupers_i]; j++) {
i += isize;
isize = asup_rowptr[j];
asup_rowptr[j] = i;
}
asup_rowptr[ilsum_i[nsupers_i]] = i + isize;
 
/* Copy the triplets into the column oriented storage */
for (i = 0; i < nnz_loc; ++i) {
jcol = ja[i];
irow = ia[i];
gbi = BlockNum( irow );
gbj = BlockNum( jcol );
/* Count nonzeros in each column of L / row of U */
if (gbi >= gbj) {
j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj );
k = ainf_colptr[j];
ainf_rowind[k] = irow;
ainf_val[k] = aij[i];
ainf_colptr[j] ++;
}
else {
j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi );
k = asup_rowptr[j];
asup_colind[k] = jcol;
asup_val[k] = aij[i];
asup_rowptr[j] ++;
}
}
 
/* Reset the column pointers to the beginning of each column */
for (j = ilsum_j[nsupers_j]; j > 0; j--)
ainf_colptr[j] = ainf_colptr[j-1];
for (j = ilsum_i[nsupers_i]; j > 0; j--)
asup_rowptr[j] = asup_rowptr[j-1];
ainf_colptr[0] = 0;
asup_rowptr[0] = 0;
SUPERLU_FREE(ia);
SUPERLU_FREE(aij);
memAux -= 2*szbuf*iword + szbuf*dword;
*p_ainf_colptr = ainf_colptr;
*p_ainf_rowind = ainf_rowind;
*p_ainf_val = ainf_val;
*p_asup_rowptr = asup_rowptr;
*p_asup_colind = asup_colind;
*p_asup_val = asup_val;
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit ddist_A()");
fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6);
#endif
 
return (-memRet);
} /* dist_A */
 
/*! \brief
*
* <pre>
* Purpose
* =======
* Distribute the input matrix onto the 2D process mesh.
*
* Arguments
* =========
*
* fact (input) fact_t
* Specifies whether or not the L and U structures will be re-used.
* = SamePattern_SameRowPerm: L and U structures are input, and
* unchanged on exit.
* This routine should not be called for this case, an error
* is generated. Instead, pddistribute routine should be called.
* = DOFACT or SamePattern: L and U structures are computed and output.
*
* n (Input) int
* Dimension of the matrix.
*
* A (Input) SuperMatrix*
* The distributed input matrix A of dimension (A->nrow, A->ncol).
* A may be overwritten by diag(R)*A*diag(C)*Pc^T.
* The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE.
*
* ScalePermstruct (Input) ScalePermstruct_t*
* The data structure to store the scaling and permutation vectors
* describing the transformations performed to the original matrix A.
*
* Glu_freeable (Input) *Glu_freeable_t
* The global structure describing the graph of L and U.
*
* LUstruct (Input) LUstruct_t*
* Data structures for L and U factors.
*
* grid (Input) gridinfo_t*
* The 2D process mesh.
*
* Return value
* ============
* < 0, number of bytes allocated on return from the dist_symbLU
* > 0, number of bytes allocated for performing the distribution
* of the data, when out of memory.
* (an approximation).
* </pre>
*/
 
float
ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A,
ScalePermstruct_t *ScalePermstruct,
Pslu_freeable_t *Pslu_freeable,
LUstruct_t *LUstruct, gridinfo_t *grid)
{
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
Glu_freeable_t Glu_freeable_n;
LocalLU_t *Llu = LUstruct->Llu;
int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k,
len, len1, nsupc, nsupc_gb, ii, nprocs;
int_t ljb; /* local block column number */
int_t nrbl; /* number of L blocks in current block column */
int_t nrbu; /* number of U blocks in current block column */
int_t gb; /* global block number; 0 < gb <= nsuper */
int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p;
int_t mybufmax[NBUFFERS];
NRformat_loc *Astore;
double *a;
int_t *asub, *xa;
int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind;
double *asup_val, *ainf_val;
int_t *xsup, *supno; /* supernode and column mapping */
int_t *lsub, *xlsub, *usub, *xusub;
int_t nsupers, nsupers_i, nsupers_j, nsupers_ij;
int_t next_ind; /* next available position in index[*] */
int_t next_val; /* next available position in nzval[*] */
int_t *index; /* indices consist of headers and row subscripts */
double *lusup, *uval; /* nonzero values in L and U */
int_t *recvBuf;
int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
/*-- Counts to be used in factorization. --*/
int_t *ToRecv, *ToSendD, **ToSendR;
/*-- Counts to be used in lower triangular solve. --*/
int_t *fmod; /* Modification count for L-solve. */
int_t **fsendx_plist; /* Column process list to send down Xk. */
int_t nfrecvx = 0; /* Number of Xk I will receive. */
int_t nfsendx = 0; /* Number of Xk I will send */
int_t kseen;
/*-- Counts to be used in upper triangular solve. --*/
int_t *bmod; /* Modification count for U-solve. */
int_t **bsendx_plist; /* Column process list to send down Xk. */
int_t nbrecvx = 0; /* Number of Xk I will receive. */
int_t nbsendx = 0; /* Number of Xk I will send */
int_t *ilsum; /* starting position of each supernode in
the full array (local) */
int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in
the full array (local, block column wise) */
/*-- Auxiliary arrays; freed on return --*/
int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */
int_t *LUb_length; /* L,U block length; size nsupers_ij */
int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
int_t *LUb_number; /* global block number; size nsupers_ij */
int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */
int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */
double *dense, *dense_col; /* SPA */
double zero = 0.0;
int_t ldaspa; /* LDA of SPA */
int_t iword, dword;
float memStrLU, memA,
memDist = 0.; /* memory used for redistributing the data, which does
not include the memory for the numerical values
of L and U (positive number)*/
float memNLU = 0.; /* memory allocated for storing the numerical values of
L and U, that will be used in the numeric
factorization (positive number) */
 
#if ( PRNTlevel>=1 )
int_t nLblocks = 0, nUblocks = 0;
#endif
/* Initialization. */
iam = grid->iam;
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
#endif
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
nprocs = grid->npcol * grid->nprow;
for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
Astore = (NRformat_loc *) A->Store;
iword = sizeof(int_t);
dword = sizeof(double);
 
if (fact == SamePattern_SameRowPerm) {
ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm.");
}
 
if ((memStrLU =
dist_symbLU (n, Pslu_freeable,
Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0)
return (memStrLU);
memDist += (-memStrLU);
xsup = Glu_persist->xsup; /* supernode and column mapping */
supno = Glu_persist->supno;
nsupers = supno[n-1] + 1;
nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */
nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */
nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j);
if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) {
fprintf (stderr, "Malloc fails for ilsum[].");
return (memDist + memNLU);
}
memNLU += (nsupers_i+1) * iword;
if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) {
fprintf (stderr, "Malloc fails for ilsum_j[].");
return (memDist + memNLU);
}
memDist += (nsupers_j+1) * iword;
 
/* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */
ilsum[0] = 0;
ldaspa = 0;
for (gb = 0; gb < nsupers; gb++)
if ( myrow == PROW( gb, grid ) ) {
i = SuperSize( gb );
ldaspa += i;
lb = LBi( gb, grid );
ilsum[lb + 1] = ilsum[lb] + i;
}
ilsum[nsupers_i] = ldaspa;
 
ldaspa_j = 0; ilsum_j[0] = 0;
for (gb = 0; gb < nsupers; gb++)
if (mycol == PCOL( gb, grid )) {
i = SuperSize( gb );
ldaspa_j += i;
lb = LBj( gb, grid );
ilsum_j[lb + 1] = ilsum_j[lb] + i;
}
ilsum_j[nsupers_j] = ldaspa_j;
if ((memA = ddist_A(A, ScalePermstruct, Glu_persist,
grid, &ainf_colptr, &ainf_rowind, &ainf_val,
&asup_rowptr, &asup_colind, &asup_val,
ilsum, ilsum_j)) > 0)
return (memDist + memA + memNLU);
memDist += (-memA);
 
/* ------------------------------------------------------------
FIRST TIME CREATING THE L AND U DATA STRUCTURES.
------------------------------------------------------------*/
/* We first need to set up the L and U data structures and then
* propagate the values of A into them.
*/
if ( !(ToRecv = intCalloc_dist(nsupers)) ) {
fprintf(stderr, "Calloc fails for ToRecv[].");
return (memDist + memNLU);
}
memNLU += nsupers * iword;
k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for ToSendR[].");
return (memDist + memNLU);
}
memNLU += k*sizeof(int_t*);
j = k * grid->npcol;
if ( !(index = intMalloc_dist(j)) ) {
fprintf(stderr, "Malloc fails for index[].");
return (memDist + memNLU);
}
memNLU += j*iword;
for (i = 0; i < j; ++i) index[i] = EMPTY;
for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
/* Auxiliary arrays used to set up L and U block data structures.
They are freed on return. */
if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
fprintf(stderr, "Calloc fails for LUb_length[].");
return (memDist + memNLU);
}
if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) {
fprintf(stderr, "Malloc fails for LUb_indptr[].");
return (memDist + memNLU);
}
if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) {
fprintf(stderr, "Calloc fails for LUb_number[].");
return (memDist + memNLU);
}
if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) {
fprintf(stderr, "Calloc fails for LUb_valptr[].");
return (memDist + memNLU);
}
memDist += 4 * nsupers_ij * iword;
k = CEILING( nsupers, grid->nprow );
/* Pointers to the beginning of each block row of U. */
if ( !(Unzval_br_ptr =
(double**)SUPERLU_MALLOC(nsupers_i * sizeof(double*))) ) {
fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
return (memDist + memNLU);
}
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
return (memDist + memNLU);
}
memNLU += nsupers_i*sizeof(double*) + nsupers_i*sizeof(int_t*);
Unzval_br_ptr[nsupers_i-1] = NULL;
Ufstnz_br_ptr[nsupers_i-1] = NULL;
 
if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) {
fprintf(stderr, "Malloc fails for ToSendD[].");
return (memDist + memNLU);
}
memNLU += nsupers_i*iword;
if ( !(Urb_marker = intCalloc_dist(nsupers_j))) {
fprintf(stderr, "Calloc fails for rb_marker[].");
return (memDist + memNLU);
}
if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) {
fprintf(stderr, "Calloc fails for rb_marker[].");
return (memDist + memNLU);
}
memDist += (nsupers_i + nsupers_j)*iword;
/* Auxiliary arrays used to set up L, U block data structures.
They are freed on return.
k is the number of local row blocks. */
if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j)
* sp_ienv_dist(3))) ) {
fprintf(stderr, "Calloc fails for SPA dense[].");
return (memDist + memNLU);
}
/* These counts will be used for triangular solves. */
if ( !(fmod = intCalloc_dist(nsupers_i)) ) {
fprintf(stderr, "Calloc fails for fmod[].");
return (memDist + memNLU);
}
if ( !(bmod = intCalloc_dist(nsupers_i)) ) {
fprintf(stderr, "Calloc fails for bmod[].");
return (memDist + memNLU);
}
/* ------------------------------------------------ */
memNLU += 2*nsupers_i*iword +
SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword;
/* Pointers to the beginning of each block column of L. */
if ( !(Lnzval_bc_ptr =
(double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[].");
return (memDist + memNLU);
}
if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
return (memDist + memNLU);
}
memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*);
Lnzval_bc_ptr[nsupers_j-1] = NULL;
Lrowind_bc_ptr[nsupers_j-1] = NULL;
/* These lists of processes will be used for triangular solves. */
if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for fsendx_plist[].");
return (memDist + memNLU);
}
len = nsupers_j * grid->nprow;
if ( !(index = intMalloc_dist(len)) ) {
fprintf(stderr, "Malloc fails for fsendx_plist[0]");
return (memDist + memNLU);
}
for (i = 0; i < len; ++i) index[i] = EMPTY;
for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
fsendx_plist[i] = &index[j];
if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for bsendx_plist[].");
return (memDist + memNLU);
}
if ( !(index = intMalloc_dist(len)) ) {
fprintf(stderr, "Malloc fails for bsendx_plist[0]");
return (memDist + memNLU);
}
for (i = 0; i < len; ++i) index[i] = EMPTY;
for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow)
bsendx_plist[i] = &index[j];
/* -------------------------------------------------------------- */
memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
for (jb = 0; jb < nsupers; jb++) {
jbcol = PCOL( jb, grid );
jbrow = PROW( jb, grid );
ljb_j = LBj( jb, grid ); /* Local block number column wise */
ljb_i = LBi( jb, grid); /* Local block number row wise */
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
if ( myrow == jbrow ) { /* Block row jb in my process row */
/* Scatter A into SPA. */
for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
if (i >= asup_rowptr[ilsum[nsupers_i]])
printf ("ERR7\n");
jcol = asup_colind[i];
if (jcol >= n)
printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
iam, jb, gb, j, jcol);
gb = BlockNum( jcol );
lb = LBj( gb, grid );
if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
if (jcol >= ldaspa_j)
printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n",
iam, jb, gb, j, jcol);
dense_col[jcol] = asup_val[i];
}
dense_col += ldaspa_j;
}
/*------------------------------------------------
* SET UP U BLOCKS.
*------------------------------------------------*/
/* Count number of blocks and length of each block. */
nrbu = 0;
len = 0; /* Number of column subscripts I own. */
len1 = 0; /* number of fstnz subscripts */
for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
if (i >= xusub[nsupers_i]) printf ("ERR10\n");
jcol = usub[i];
gb = BlockNum( jcol ); /* Global block number */
/*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986)
printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n",
iam, jb, gb, jcol, jbcol, pc); */
lb = LBj( gb, grid ); /* Local block number */
pc = PCOL( gb, grid ); /* Process col owning this block */
if (mycol == jbcol) ToSendR[ljb_j][pc] = YES;
/* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */
pr = PROW( gb, grid );
if ( pr != jbrow && mycol == pc)
bsendx_plist[lb][jbrow] = YES;
if (mycol == pc) {
len += nsupc;
LUb_length[lb] += nsupc;
ToSendD[ljb_i] = YES;
if (Urb_marker[lb] <= jb) { /* First see this block */
if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++;
Urb_marker[lb] = jb + 1;
LUb_number[nrbu] = gb;
/* if (gb == 391825 && jb == 145361)
printf ("Pe[%d] T1 [%d %d] nrbu %d \n",
iam, jb, gb, nrbu); */
nrbu ++;
len1 += SuperSize( gb );
if ( gb != jb )/* Exclude diagonal block. */
++bmod[ljb_i];/* Mod. count for back solve */
#if ( PRNTlevel>=1 )
++nUblocks;
#endif
}
}
} /* for i ... */
if ( nrbu ) {
/* Sort the blocks of U in increasing block column index.
SuperLU_DIST assumes this is true */
/* simple insert sort algorithm */
/* to be transformed in quick sort */
for (j = 1; j < nrbu; j++) {
k = LUb_number[j];
for (i=j-1; i>=0 && LUb_number[i] > k; i--) {
LUb_number[i+1] = LUb_number[i];
}
LUb_number[i+1] = k;
}
/* Set up the initial pointers for each block in
index[] and nzval[]. */
/* Add room for descriptors */
len1 += BR_HEADER + nrbu * UB_DESCRIPTOR;
if ( !(index = intMalloc_dist(len1+1)) ) {
fprintf (stderr, "Malloc fails for Uindex[]");
return (memDist + memNLU);
}
Ufstnz_br_ptr[ljb_i] = index;
if (!(Unzval_br_ptr[ljb_i] =
doubleMalloc_dist(len))) {
fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
return (memDist + memNLU);
}
memNLU += (len1+1)*iword + len*dword;
uval = Unzval_br_ptr[ljb_i];
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
index[0] = nrbu; /* Number of column blocks */
index[1] = len; /* Total length of nzval[] */
index[2] = len1; /* Total length of index */
index[len1] = -1; /* End marker */
next_ind = BR_HEADER;
next_val = 0;
for (k = 0; k < nrbu; k++) {
gb = LUb_number[k];
lb = LBj( gb, grid );
len = LUb_length[lb];
LUb_length[lb] = 0; /* Reset vector of block length */
index[next_ind++] = gb; /* Descriptor */
index[next_ind++] = len;
LUb_indptr[lb] = next_ind;
for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++)
index[next_ind] = FstBlockC( jb + 1 );
LUb_valptr[lb] = next_val;
next_val += len;
}
/* Propagate the fstnz subscripts to Ufstnz_br_ptr[],
and the initial values of A from SPA into Unzval_br_ptr[]. */
for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
jcol = usub[i];
gb = BlockNum( jcol );
if ( mycol == PCOL( gb, grid ) ) {
lb = LBj( gb, grid );
k = LUb_indptr[lb]; /* Start fstnz in index */
index[k + jcol - FstBlockC( gb )] = FstBlockC( jb );
}
} /* for i ... */
for (i = 0; i < nrbu; i++) {
gb = LUb_number[i];
lb = LBj( gb, grid );
next_ind = LUb_indptr[lb];
k = FstBlockC( jb + 1);
jcol = ilsum_j[lb];
for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) {
dense_col = dense;
j = index[next_ind+jj];
for (ii = j; ii < k; ii++) {
uval[LUb_valptr[lb]++] = dense_col[jcol];
dense_col[jcol] = zero;
dense_col += ldaspa_j;
}
}
}
} else {
Ufstnz_br_ptr[ljb_i] = NULL;
Unzval_br_ptr[ljb_i] = NULL;
} /* if nrbu ... */
} /* if myrow == jbrow */
/*------------------------------------------------
* SET UP L BLOCKS.
*------------------------------------------------*/
if (mycol == jbcol) { /* Block column jb in my process column */
/* Scatter A_inf into SPA. */
for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) {
for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) {
irow = ainf_rowind[i];
if (irow >= n) printf ("Pe[%d] ERR1\n", iam);
gb = BlockNum( irow );
if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam);
if ( myrow == PROW( gb, grid ) ) {
lb = LBi( gb, grid );
irow = ilsum[lb] + irow - FstBlockC( gb );
if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam);
dense_col[irow] = ainf_val[i];
}
}
dense_col += ldaspa;
}
/* sort the indices of the diagonal block at the beginning of xlsub */
if (myrow == jbrow) {
k = xlsub[ljb_j];
for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
irow = lsub[i];
if (irow < nsupc + fsupc && i != k+irow-fsupc) {
lsub[i] = lsub[k + irow - fsupc];
lsub[k + irow - fsupc] = irow;
i --;
}
}
}
/* Count number of blocks and length of each block. */
nrbl = 0;
len = 0; /* Number of row subscripts I own. */
kseen = 0;
for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
irow = lsub[i];
gb = BlockNum( irow ); /* Global block number */
pr = PROW( gb, grid ); /* Process row owning this block */
if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY &&
myrow == jbrow) {
fsendx_plist[ljb_j][pr] = YES;
++nfsendx;
}
if ( myrow == pr ) {
lb = LBi( gb, grid ); /* Local block number */
if (Lrb_marker[lb] <= jb) { /* First see this block */
Lrb_marker[lb] = jb + 1;
LUb_length[lb] = 1;
LUb_number[nrbl++] = gb;
if ( gb != jb ) /* Exclude diagonal block. */
++fmod[lb]; /* Mod. count for forward solve */
if ( kseen == 0 && myrow != jbrow ) {
++nfrecvx;
kseen = 1;
}
#if ( PRNTlevel>=1 )
++nLblocks;
#endif
} else
++LUb_length[lb];
++len;
}
} /* for i ... */
if ( nrbl ) { /* Do not ensure the blocks are sorted! */
/* Set up the initial pointers for each block in
index[] and nzval[]. */
/* If I am the owner of the diagonal block, order it first in LUb_number.
Necessary for SuperLU_DIST routines */
kseen = EMPTY;
for (j = 0; j < nrbl; j++) {
if (LUb_number[j] == jb)
kseen = j;
}
if (kseen != EMPTY && kseen != 0) {
LUb_number[kseen] = LUb_number[0];
LUb_number[0] = jb;
}
/* Add room for descriptors */
len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
if ( !(index = intMalloc_dist(len1)) ) {
fprintf (stderr, "Malloc fails for index[]");
return (memDist + memNLU);
}
Lrowind_bc_ptr[ljb_j] = index;
if (!(Lnzval_bc_ptr[ljb_j] =
doubleMalloc_dist(len*nsupc))) {
fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb);
return (memDist + memNLU);
}
memNLU += len1*iword + len*nsupc*dword;
lusup = Lnzval_bc_ptr[ljb_j];
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
index[0] = nrbl; /* Number of row blocks */
index[1] = len; /* LDA of the nzval[] */
next_ind = BC_HEADER;
next_val = 0;
for (k = 0; k < nrbl; ++k) {
gb = LUb_number[k];
lb = LBi( gb, grid );
len = LUb_length[lb];
LUb_length[lb] = 0;
index[next_ind++] = gb; /* Descriptor */
index[next_ind++] = len;
LUb_indptr[lb] = next_ind;
LUb_valptr[lb] = next_val;
next_ind += len;
next_val += len;
}
/* Propagate the compressed row subscripts to Lindex[],
and the initial values of A from SPA into Lnzval[]. */
len = index[1]; /* LDA of lusup[] */
for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) {
irow = lsub[i];
gb = BlockNum( irow );
if ( myrow == PROW( gb, grid ) ) {
lb = LBi( gb, grid );
k = LUb_indptr[lb]++; /* Random access a block */
index[k] = irow;
k = LUb_valptr[lb]++;
irow = ilsum[lb] + irow - FstBlockC( gb );
for (j = 0, dense_col = dense; j < nsupc; ++j) {
lusup[k] = dense_col[irow];
dense_col[irow] = zero;
k += len;
dense_col += ldaspa;
}
}
} /* for i ... */
} else {
Lrowind_bc_ptr[ljb_j] = NULL;
Lnzval_bc_ptr[ljb_j] = NULL;
} /* if nrbl ... */
} /* if mycol == pc */
} /* for jb ... */
 
SUPERLU_FREE(ilsum_j);
SUPERLU_FREE(Urb_marker);
SUPERLU_FREE(LUb_length);
SUPERLU_FREE(LUb_indptr);
SUPERLU_FREE(LUb_number);
SUPERLU_FREE(LUb_valptr);
SUPERLU_FREE(Lrb_marker);
SUPERLU_FREE(dense);
/* Free the memory used for storing L and U */
SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
if (lsub != NULL)
SUPERLU_FREE(lsub);
if (usub != NULL)
SUPERLU_FREE(usub);
/* Free the memory used for storing A */
SUPERLU_FREE(ainf_colptr);
if (ainf_rowind != NULL) {
SUPERLU_FREE(ainf_rowind);
SUPERLU_FREE(ainf_val);
}
SUPERLU_FREE(asup_rowptr);
if (asup_colind != NULL) {
SUPERLU_FREE(asup_colind);
SUPERLU_FREE(asup_val);
}
/* exchange information about bsendx_plist in between column of processors */
k = SUPERLU_MAX( grid->nprow, grid->npcol);
if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) {
fprintf (stderr, "Malloc fails for recvBuf[].");
return (memDist + memNLU);
}
if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
fprintf (stderr, "Malloc fails for nnzToRecv[].");
return (memDist + memNLU);
}
if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
fprintf (stderr, "Malloc fails for ptrToRecv[].");
return (memDist + memNLU);
}
if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
fprintf (stderr, "Malloc fails for nnzToRecv[].");
return (memDist + memNLU);
}
if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) {
fprintf (stderr, "Malloc fails for ptrToRecv[].");
return (memDist + memNLU);
}
if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int)))
memDist = nsupers*k*iword +4*nprocs * sizeof(int);
for (p = 0; p < nprocs; p++)
nnzToRecv[p] = 0;
for (jb = 0; jb < nsupers; jb++) {
jbcol = PCOL( jb, grid );
jbrow = PROW( jb, grid );
p = PNUM(jbrow, jbcol, grid);
nnzToRecv[p] += grid->npcol;
}
i = 0;
for (p = 0; p < nprocs; p++) {
ptrToRecv[p] = i;
i += nnzToRecv[p];
ptrToSend[p] = 0;
if (p != iam)
nnzToSend[p] = nnzToRecv[iam];
else
nnzToSend[p] = 0;
}
nnzToRecv[iam] = 0;
i = ptrToRecv[iam];
for (jb = 0; jb < nsupers; jb++) {
jbcol = PCOL( jb, grid );
jbrow = PROW( jb, grid );
p = PNUM(jbrow, jbcol, grid);
if (p == iam) {
ljb_j = LBj( jb, grid ); /* Local block number column wise */
for (j = 0; j < grid->npcol; j++, i++)
recvBuf[i] = ToSendR[ljb_j][j];
}
}
MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
for (jb = 0; jb < nsupers; jb++) {
jbcol = PCOL( jb, grid );
jbrow = PROW( jb, grid );
p = PNUM(jbrow, jbcol, grid);
ljb_j = LBj( jb, grid ); /* Local block number column wise */
ljb_i = LBi( jb, grid ); /* Local block number row wise */
/* (myrow == jbrow) {
if (ToSendD[ljb_i] == YES)
ToRecv[jb] = 1;
}
else {
if (recvBuf[ptrToRecv[p] + mycol] == YES)
ToRecv[jb] = 2;
} */
if (recvBuf[ptrToRecv[p] + mycol] == YES) {
if (myrow == jbrow)
ToRecv[jb] = 1;
else
ToRecv[jb] = 2;
}
if (mycol == jbcol) {
for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++)
ToSendR[ljb_j][i] = recvBuf[j];
ToSendR[ljb_j][mycol] = EMPTY;
}
ptrToRecv[p] += grid->npcol;
}
/* exchange information about bsendx_plist in between column of processors */
MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
MPI_MAX, grid->cscp.comm);
for (jb = 0; jb < nsupers; jb ++) {
jbcol = PCOL( jb, grid);
jbrow = PROW( jb, grid);
if (mycol == jbcol) {
ljb_j = LBj( jb, grid ); /* Local block number column wise */
if (myrow == jbrow ) {
for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) {
(*bsendx_plist)[k] = recvBuf[k];
if ((*bsendx_plist)[k] != EMPTY)
nbsendx ++;
}
}
else {
for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++)
(*bsendx_plist)[k] = EMPTY;
}
}
}
SUPERLU_FREE(nnzToRecv);
SUPERLU_FREE(ptrToRecv);
SUPERLU_FREE(nnzToSend);
SUPERLU_FREE(ptrToSend);
SUPERLU_FREE(recvBuf);
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
Llu->Unzval_br_ptr = Unzval_br_ptr;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
Llu->ToSendR = ToSendR;
Llu->fmod = fmod;
Llu->fsendx_plist = fsendx_plist;
Llu->nfrecvx = nfrecvx;
Llu->nfsendx = nfsendx;
Llu->bmod = bmod;
Llu->bsendx_plist = bsendx_plist;
Llu->nbrecvx = nbrecvx;
Llu->nbsendx = nbsendx;
Llu->ilsum = ilsum;
Llu->ldalsum = ldaspa;
LUstruct->Glu_persist = Glu_persist;
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
nLblocks, nUblocks);
#endif
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(Llu->mod_bit = intMalloc_dist(k)) )
ABORT("Malloc fails for mod_bit[].");
 
/* Find the maximum buffer size. */
MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t,
MPI_MAX, grid->comm);
#if ( DEBUGlevel>=1 )
/* Memory allocated but not freed:
ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
ToRecv, ToSendR, ToSendD, mod_bit
*/
CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
#endif
return (- (memDist+memNLU));
} /* ddist_psymbtonum */
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/Cnames.h
New file
0,0 → 1,349
/*! @file
* \brief Macro definitions
*
* <pre>
* -- Distributed SuperLU routine (version 1.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* September 1, 1999
* </pre>
*/
 
#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */
#define __SUPERLU_CNAMES
 
/*
* These macros define how C routines will be called. ADD_ assumes that
* they will be called by fortran, which expects C routines to have an
* underscore postfixed to the name (Suns, and the Intel expect this).
* NOCHANGE indicates that fortran will be calling, and that it expects
* the name called by fortran to be identical to that compiled by the C
* (RS6K's do this). UPCASE says it expects C routines called by fortran
* to be in all upcase (CRAY wants this).
*/
 
#define ADD_ 0
#define NOCHANGE 1
#define UPCASE 2
#define C_CALL 3
 
#ifdef UpCase
#define F77_CALL_C UPCASE
#endif
 
#ifdef NoChange
#define F77_CALL_C NOCHANGE
#endif
 
#ifdef Add_
#define F77_CALL_C ADD_
#endif
 
#ifndef F77_CALL_C
#define F77_CALL_C ADD_
#endif
 
#if (F77_CALL_C == ADD_)
/*
* These defines set up the naming scheme required to have a fortran 77
* routine call a C routine
* No redefinition necessary to have following Fortran to C interface:
* FORTRAN CALL C DECLARATION
* call dgemm(...) void dgemm_(...)
*
* This is the default.
*/
/* These are the functions defined in F90 wraper */
#define f_create_gridinfo_handle f_create_gridinfo_handle_
#define f_create_options_handle f_create_options_handle_
#define f_create_ScalePerm_handle f_create_scaleperm_handle_
#define f_create_LUstruct_handle f_create_lustruct_handle_
#define f_create_SOLVEstruct_handle f_create_solvestruct_handle_
#define f_create_SuperMatrix_handle f_create_supermatrix_handle_
#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle_
#define f_destroy_options_handle f_destroy_options_handle_
#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle_
#define f_destroy_LUstruct_handle f_destroy_lustruct_handle_
#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle_
#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle_
#define f_create_SuperLUStat_handle f_create_superlustat_handle_
#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle_
#define f_get_gridinfo f_get_gridinfo_
#define f_get_SuperMatrix f_get_supermatrix_
#define f_set_SuperMatrix f_set_supermatrix_
#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix_
#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix_
#define f_get_superlu_options f_get_superlu_options_
#define f_set_superlu_options f_set_superlu_options_
#define f_set_default_options f_set_default_options_
#define f_superlu_gridinit f_superlu_gridinit_
#define f_superlu_gridexit f_superlu_gridexit_
#define f_ScalePermstructInit f_scalepermstructinit_
#define f_ScalePermstructFree f_scalepermstructfree_
#define f_PStatInit f_pstatinit_
#define f_PStatFree f_pstatfree_
#define f_LUstructInit f_lustructinit_
#define f_LUstructFree f_lustructfree_
#define f_Destroy_LU f_destroy_lu_
#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist_
#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist_
#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist_
#define f_dSolveFinalize f_dsolvefinalize_
#define f_pdgssvx f_pdgssvx_
#define f_dcreate_dist_matrix f_dcreate_dist_matrix_
#define f_check_malloc f_check_malloc_
#endif
 
#if (F77_CALL_C == UPCASE)
/*
* These defines set up the naming scheme required to have a fortran 77
* routine call a C routine
* following Fortran to C interface:
* FORTRAN CALL C DECLARATION
* call dgemm(...) void DGEMM(...)
*/
/* BLAS */
#define sasum_ SASUM
#define isamax_ ISAMAX
#define scopy_ SCOPY
#define sscal_ SSCAL
#define sger_ SGER
#define snrm2_ SNRM2
#define ssymv_ SSYMV
#define sdot_ SDOT
#define saxpy_ SAXPY
#define ssyr2_ SSYR2
#define srot_ SROT
#define sgemv_ SGEMV
#define strsv_ STRSV
#define sgemm_ SGEMM
#define strsm_ STRSM
 
#define dasum_ DASUM
#define idamax_ IDAMAX
#define dcopy_ DCOPY
#define dscal_ DSCAL
#define dger_ DGER
#define dnrm2_ DNRM2
#define dsymv_ DSYMV
#define ddot_ DDOT
#define daxpy_ DAXPY
#define dsyr2_ DSYR2
#define drot_ DROT
#define dgemv_ DGEMV
#define dtrsv_ DTRSV
#define dgemm_ DGEMM
#define dtrsm_ DTRSM
 
#define scasum_ SCASUM
#define icamax_ ICAMAX
#define ccopy_ CCOPY
#define cscal_ CSCAL
#define scnrm2_ SCNRM2
#define caxpy_ CAXPY
#define cgemv_ CGEMV
#define ctrsv_ CTRSV
#define cgemm_ CGEMM
#define ctrsm_ CTRSM
#define cgerc_ CGERC
#define chemv_ CHEMV
#define cher2_ CHER2
 
#define dzasum_ DZASUM
#define izamax_ IZAMAX
#define zcopy_ ZCOPY
#define zscal_ ZSCAL
#define dznrm2_ DZNRM2
#define zaxpy_ ZAXPY
#define zgemv_ ZGEMV
#define ztrsv_ ZTRSV
#define zgemm_ ZGEMM
#define ztrsm_ ZTRSM
#define zgerc_ ZGERC
#define zhemv_ ZHEMV
#define zher2_ ZHER2
#define zgeru_ ZGERU
 
/* LAPACK */
#define dlamch_ DLAMCH
#define slamch_ SLAMCH
#define xerbla_ XERBLA
#define lsame_ LSAME
 
#define mc64id_ MC64ID
#define mc64ad_ MC64AD
#define c_bridge_dgssv_ C_BRIDGE_DGSSV
#define c_fortran_slugrid_ C_FORTRAN_SLUGRID
#define c_fortran_pdgssvx_ C_FORTRAN_PDGSSVX
#define c_fortran_pdgssvx_ABglobal_ C_FORTRAN_PDGSSVX_ABGLOBAL
#define c_fortran_pzgssvx_ C_FORTRAN_PZGSSVX
#define c_fortran_pzgssvx_ABglobal_ C_FORTRAN_PZGSSVX_ABGLOBAL
 
/* These are the functions defined in F90 wraper */
#define f_create_gridinfo_handle F_CREATE_GRIDINFO_HANDLE
#define f_create_options_handle F_CREATE_OPTIONS_HANDLE
#define f_create_ScalePerm_handle F_CREATE_SCALEPERM_HANDLE
#define f_create_LUstruct_handle F_CREATE_LUSTRUCT_HANDLE
#define f_create_SOLVEstruct_handle F_CREATE_SOLVESTRUCT_HANDLE
#define f_create_SuperMatrix_handle F_CREATE_SUPERMATRIX_HANDLE
#define f_destroy_gridinfo_handle F_DESTROY_GRIDINFO_HANDLE
#define f_destroy_options_handle F_DESTROY_OPTIONS_HANDLE
#define f_destroy_ScalePerm_handle F_DESTROY_SCALEPERM_HANDLE
#define f_destroy_LUstruct_handle F_DESTROY_LUSTRUCT_HANDLE
#define f_destroy_SOLVEstruct_handle F_DESTROY_SOLVESTRUCT_HANDLE
#define f_destroy_SuperMatrix_handle F_DESTROY_SUPERMATRIX_HANDLE
#define f_create_SuperLUStat_handle F_CREATE_SUPERLUSTAT_HANDLE
#define f_destroy_SuperLUStat_handle F_DESTROY_SUPERLUSTAT_HANDLE
#define f_get_gridinfo F_GET_GRIDINFO
#define f_get_SuperMatrix F_GET_SUPERMATRIX
#define f_set_SuperMatrix F_SET_SUPERMATRIX
#define f_get_CompRowLoc_Matrix F_GET_COMPROWLOC_MATRIX
#define f_set_CompRowLoc_Matrix F_SET_COMPROWLOC_MATRIX
#define f_get_superlu_options F_GET_SUPERLU_OPTIONS
#define f_set_superlu_options F_SET_SUPERLU_OPTIONS
#define f_set_default_options F_SET_DEFAULT_OPTIONS
#define f_superlu_gridinit F_SUPERLU_GRIDINIT
#define f_superlu_gridexit F_SUPERLU_GRIDEXIT
#define f_ScalePermstructInit F_SCALEPERMSTRUCTINIT
#define f_ScalePermstructFree F_SCALEPERMSTRUCTFREE
#define f_PStatInit F_PSTATINIT
#define f_PStatFree F_PSTATFREE
#define f_LUstructInit F_LUSTRUCTINIT
#define f_LUstructFree F_LUSTRUCTFREE
#define f_Destroy_LU F_DESTROY_LU
#define f_dCreate_CompRowLoc_Mat_dist F_DCREATE_COMPROWLOC_MAT_DIST
#define f_Destroy_CompRowLoc_Mat_dist F_DESTROY_COMPROWLOC_MAT_DIST
#define f_Destroy_SuperMat_Store_dist F_DESTROY_SUPERMAT_STORE_DIST
#define f_dSolveFinalize F_DSOLVEFINALIZE
#define f_pdgssvx F_PDGSSVX
#define f_dcreate_dist_matrix F_DCREATE_DIST_MATRIX
#define f_check_malloc F_CHECK_MALLOC
#endif
 
#if (F77_CALL_C == NOCHANGE)
/*
* These defines set up the naming scheme required to have a fortran 77
* routine call a C routine
* for following Fortran to C interface:
* FORTRAN CALL C DECLARATION
* call dgemm(...) void dgemm(...)
*/
/* BLAS */
#define sasum_ sasum
#define isamax_ isamax
#define scopy_ scopy
#define sscal_ sscal
#define sger_ sger
#define snrm2_ snrm2
#define ssymv_ ssymv
#define sdot_ sdot
#define saxpy_ saxpy
#define ssyr2_ ssyr2
#define srot_ srot
#define sgemv_ sgemv
#define strsv_ strsv
#define sgemm_ sgemm
#define strsm_ strsm
 
#define dasum_ dasum
#define idamax_ idamax
#define dcopy_ dcopy
#define dscal_ dscal
#define dger_ dger
#define dnrm2_ dnrm2
#define dsymv_ dsymv
#define ddot_ ddot
#define daxpy_ daxpy
#define dsyr2_ dsyr2
#define drot_ drot
#define dgemv_ dgemv
#define dtrsv_ dtrsv
#define dgemm_ dgemm
#define dtrsm_ dtrsm
 
#define scasum_ scasum
#define icamax_ icamax
#define ccopy_ ccopy
#define cscal_ cscal
#define scnrm2_ scnrm2
#define caxpy_ caxpy
#define cgemv_ cgemv
#define ctrsv_ ctrsv
#define cgemm_ cgemm
#define ctrsm_ ctrsm
#define cgerc_ cgerc
#define chemv_ chemv
#define cher2_ cher2
 
#define dzasum_ dzasum
#define izamax_ izamax
#define zcopy_ zcopy
#define zscal_ zscal
#define dznrm2_ dznrm2
#define zaxpy_ zaxpy
#define zgemv_ zgemv
#define ztrsv_ ztrsv
#define zgemm_ zgemm
#define ztrsm_ ztrsm
#define zgerc_ zgerc
#define zhemv_ zhemv
#define zher2_ zher2
#define zgeru_ zgeru
 
/* LAPACK */
#define dlamch_ dlamch
#define slamch_ slamch
#define xerbla_ xerbla
#define lsame_ lsame
 
#define mc64id_ mc64id
#define mc64ad_ mc64ad
 
#define c_bridge_dgssv_ c_bridge_dgssv
#define c_fortran_slugrid_ c_fortran_slugrid
#define c_fortran_pdgssvx_ c_fortran_pdgssvx
#define c_fortran_pdgssvx_ABglobal_ c_fortran_pdgssvx_abglobal
#define c_fortran_pzgssvx_ c_fortran_pzgssvx
#define c_fortran_pzgssvx_ABglobal_ c_fortran_pzgssvx_abglobal
 
/* These are the functions defined in F90 wraper */
#define f_create_gridinfo_handle f_create_gridinfo_handle
#define f_create_options_handle f_create_options_handle
#define f_create_ScalePerm_handle f_create_scaleperm_handle
#define f_create_LUstruct_handle f_create_lustruct_handle
#define f_create_SOLVEstruct_handle f_create_solvestruct_handle
#define f_create_SuperMatrix_handle f_create_supermatrix_handle
#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle
#define f_destroy_options_handle f_destroy_options_handle
#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle
#define f_destroy_LUstruct_handle f_destroy_lustruct_handle
#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle
#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle
#define f_create_SuperLUStat_handle f_create_superlustat_handle
#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle
#define f_get_gridinfo f_get_gridinfo
#define f_get_SuperMatrix f_get_supermatrix
#define f_set_SuperMatrix f_set_supermatrix
#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix
#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix
#define f_get_superlu_options f_get_superlu_options
#define f_set_superlu_options f_set_superlu_options
#define f_set_default_options f_set_default_options
#define f_superlu_gridinit f_superlu_gridinit
#define f_superlu_gridexit f_superlu_gridexit
#define f_ScalePermstructInit f_scalepermstructinit
#define f_ScalePermstructFree f_scalepermstructfree
#define f_PStatInit f_pstatinit
#define f_PStatFree f_pstatfree
#define f_LUstructInit f_lustructinit
#define f_LUstructFree f_lustructfree
#define f_Destroy_LU f_destroy_lu
#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist
#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist
#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist
#define f_dSolveFinalize f_dsolvefinalize
#define f_pdgssvx f_pdgssvx
#define f_dcreate_dist_matrix f_dcreate_dist_matrix
#define f_check_malloc f_check_malloc
#endif
 
#endif /* __SUPERLU_CNAMES */
/trunk/OTHER/SuperLU_DIST_2.5/SRC/dmemory.c
New file
0,0 → 1,152
/*! @file
* \brief Memory utilities
*
* <pre>
* -- Distributed SuperLU routine (version 1.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* September 1, 1999
* </pre>
*/
 
#include "superlu_ddefs.h"
 
 
/* Variables external to this file */
extern LU_stack_t stack;
 
 
void *duser_malloc_dist(int_t bytes, int_t which_end)
{
void *buf;
if ( StackFull(bytes) ) return (NULL);
 
if ( which_end == HEAD ) {
buf = (char*) stack.array + stack.top1;
stack.top1 += bytes;
} else {
stack.top2 -= bytes;
buf = (char*) stack.array + stack.top2;
}
stack.used += bytes;
return buf;
}
 
 
void duser_free_dist(int_t bytes, int_t which_end)
{
if ( which_end == HEAD ) {
stack.top1 -= bytes;
} else {
stack.top2 += bytes;
}
stack.used -= bytes;
}
 
 
 
/*! \brief
*
* <pre>
* mem_usage consists of the following fields:
* - for_lu (float)
* The amount of space used in bytes for the L\U data structures.
* - total (float)
* The amount of space needed in bytes to perform factorization.
* - expansions (int)
* Number of memory expansions during the LU factorization.
* </pre>
*/
int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid,
mem_usage_t *mem_usage)
{
register int_t dword, gb, iword, k, maxsup, nb, nsupers;
int_t *index, *xsup;
int iam, mycol, myrow;
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
LocalLU_t *Llu = LUstruct->Llu;
 
iam = grid->iam;
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
iword = sizeof(int_t);
dword = sizeof(double);
maxsup = sp_ienv_dist(3);
nsupers = Glu_persist->supno[n-1] + 1;
xsup = Glu_persist->xsup;
mem_usage->for_lu = 0;
 
/* For L factor */
nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
for (k = 0; k < nb; ++k) {
gb = k * grid->npcol + mycol; /* Global block number. */
if ( gb < nsupers ) {
index = Llu->Lrowind_bc_ptr[k];
if ( index ) {
mem_usage->for_lu += (float)
((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword);
mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword);
}
}
}
 
/* For U factor */
nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */
for (k = 0; k < nb; ++k) {
gb = k * grid->nprow + myrow; /* Global block number. */
if ( gb < nsupers ) {
index = Llu->Ufstnz_br_ptr[k];
if ( index ) {
mem_usage->for_lu += (float)(index[2] * iword);
mem_usage->for_lu += (float)(index[1] * dword);
}
}
}
 
/* Working storage to support factorization */
mem_usage->total = mem_usage->for_lu;
mem_usage->total +=
(float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword +
( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword );
/**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/
mem_usage->total +=
(float)( Llu->bufmax[0] * iword + Llu->bufmax[1] * dword );
mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword;
k = CEILING( nsupers, grid->nprow );
mem_usage->total += (float)(2 * k * iword);
 
return 0;
} /* dQuerySpace_dist */
 
 
/*
* Allocate storage for original matrix A
*/
void
dallocateA_dist(int_t n, int_t nnz, double **a, int_t **asub, int_t **xa)
{
*a = (double *) doubleMalloc_dist(nnz);
*asub = (int_t *) intMalloc_dist(nnz);
*xa = (int_t *) intMalloc_dist(n+1);
}
 
 
double *doubleMalloc_dist(int_t n)
{
double *buf;
buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) );
return (buf);
}
 
double *doubleCalloc_dist(int_t n)
{
double *buf;
register int_t i;
double zero = 0.0;
buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double));
if ( !buf ) return (buf);
for (i = 0; i < n; ++i) buf[i] = zero;
return (buf);
}
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/xerbla.c
New file
0,0 → 1,50
/*! @file
* \brief
 
<pre>
-- LAPACK auxiliary routine (version 2.0) --
Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
Courant Institute, Argonne National Lab, and Rice University
September 30, 1994
</pre>
*/
#include "Cnames.h"
 
/* Subroutine */
/*! \brief
 
<pre>
Purpose
=======
 
XERBLA is an error handler for the LAPACK routines.
It is called by an LAPACK routine if an input parameter has an
invalid value. A message is printed and execution stops.
 
Installers may consider modifying the STOP statement in order to
call system-specific exception-handling facilities.
 
Arguments
=========
 
SRNAME (input) CHARACTER*6
The name of the routine which called XERBLA.
 
INFO (input) INT
The position of the invalid parameter in the parameter list
 
of the calling routine.
 
=====================================================================
</pre>
*/
int xerbla_(char *srname, int *info)
{
printf("** On entry to %6s, parameter number %2d had an illegal value\n",
srname, *info);
 
/* End of XERBLA */
 
return 0;
} /* xerbla_ */
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/util.c
New file
0,0 → 1,786
/*! @file
* \brief Utilities functions
*
* <pre>
* -- Distributed SuperLU routine (version 2.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* February 1, 2003
* </pre>
*/
 
#include <math.h>
#include "superlu_ddefs.h"
 
 
/*! \brief Deallocate the structure pointing to the actual storage of the matrix. */
void
Destroy_SuperMatrix_Store_dist(SuperMatrix *A)
{
SUPERLU_FREE ( A->Store );
}
 
void
Destroy_CompCol_Matrix_dist(SuperMatrix *A)
{
NCformat *Astore = A->Store;
SUPERLU_FREE( Astore->rowind );
SUPERLU_FREE( Astore->colptr );
if ( Astore->nzval ) SUPERLU_FREE( Astore->nzval );
SUPERLU_FREE( Astore );
}
 
void
Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A)
{
NRformat_loc *Astore = A->Store;
SUPERLU_FREE( Astore->rowptr );
SUPERLU_FREE( Astore->colind );
SUPERLU_FREE( Astore->nzval );
SUPERLU_FREE( Astore );
}
 
void
Destroy_CompRow_Matrix_dist(SuperMatrix *A)
{
SUPERLU_FREE( ((NRformat *)A->Store)->rowptr );
SUPERLU_FREE( ((NRformat *)A->Store)->colind );
SUPERLU_FREE( ((NRformat *)A->Store)->nzval );
SUPERLU_FREE( A->Store );
}
 
void
Destroy_SuperNode_Matrix_dist(SuperMatrix *A)
{
SUPERLU_FREE ( ((SCformat *)A->Store)->rowind );
SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr );
SUPERLU_FREE ( ((SCformat *)A->Store)->nzval );
SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr );
SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup );
SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col );
SUPERLU_FREE ( A->Store );
}
 
/*! \brief A is of type Stype==NCP */
void
Destroy_CompCol_Permuted_dist(SuperMatrix *A)
{
SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg );
SUPERLU_FREE ( ((NCPformat *)A->Store)->colend );
SUPERLU_FREE ( A->Store );
}
 
/*! \brief A is of type Stype==DN */
void
Destroy_Dense_Matrix_dist(SuperMatrix *A)
{
DNformat* Astore = A->Store;
SUPERLU_FREE (Astore->nzval);
SUPERLU_FREE ( A->Store );
}
 
/*! \brief Destroy distributed L & U matrices. */
void
Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
{
int_t i, nb, nsupers;
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
LocalLU_t *Llu = LUstruct->Llu;
 
#if ( DEBUGlevel>=1 )
int iam;
MPI_Comm_rank( MPI_COMM_WORLD, &iam );
CHECK_MALLOC(iam, "Enter Destroy_LU()");
#endif
 
nsupers = Glu_persist->supno[n-1] + 1;
 
nb = CEILING(nsupers, grid->npcol);
for (i = 0; i < nb; ++i)
if ( Llu->Lrowind_bc_ptr[i] ) {
SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
}
SUPERLU_FREE (Llu->Lrowind_bc_ptr);
SUPERLU_FREE (Llu->Lnzval_bc_ptr);
 
nb = CEILING(nsupers, grid->nprow);
for (i = 0; i < nb; ++i)
if ( Llu->Ufstnz_br_ptr[i] ) {
SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
}
SUPERLU_FREE (Llu->Ufstnz_br_ptr);
SUPERLU_FREE (Llu->Unzval_br_ptr);
 
/* The following can be freed after factorization. */
SUPERLU_FREE(Llu->ToRecv);
SUPERLU_FREE(Llu->ToSendD);
SUPERLU_FREE(Llu->ToSendR[0]);
SUPERLU_FREE(Llu->ToSendR);
 
/* The following can be freed only after iterative refinement. */
SUPERLU_FREE(Llu->ilsum);
SUPERLU_FREE(Llu->fmod);
SUPERLU_FREE(Llu->fsendx_plist[0]);
SUPERLU_FREE(Llu->fsendx_plist);
SUPERLU_FREE(Llu->bmod);
SUPERLU_FREE(Llu->bsendx_plist[0]);
SUPERLU_FREE(Llu->bsendx_plist);
SUPERLU_FREE(Llu->mod_bit);
 
SUPERLU_FREE(Glu_persist->xsup);
SUPERLU_FREE(Glu_persist->supno);
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit Destroy_LU()");
#endif
}
 
/*! \brief Allocate storage in ScalePermstruct */
void ScalePermstructInit(const int_t m, const int_t n,
ScalePermstruct_t *ScalePermstruct)
{
ScalePermstruct->DiagScale = NOEQUIL;
if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) )
ABORT("Malloc fails for perm_r[].");
if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) )
ABORT("Malloc fails for perm_c[].");
}
 
/*! \brief Deallocate ScalePermstruct */
void ScalePermstructFree(ScalePermstruct_t *ScalePermstruct)
{
SUPERLU_FREE(ScalePermstruct->perm_r);
SUPERLU_FREE(ScalePermstruct->perm_c);
switch ( ScalePermstruct->DiagScale ) {
case ROW:
SUPERLU_FREE(ScalePermstruct->R);
break;
case COL:
SUPERLU_FREE(ScalePermstruct->C);
break;
case BOTH:
SUPERLU_FREE(ScalePermstruct->R);
SUPERLU_FREE(ScalePermstruct->C);
break;
}
}
 
/*! \brief Allocate storage in LUstruct */
void LUstructInit(const int_t m, const int_t n, LUstruct_t *LUstruct)
{
if ( !(LUstruct->etree = intMalloc_dist(n)) )
ABORT("Malloc fails for etree[].");
if ( !(LUstruct->Glu_persist = (Glu_persist_t *)
SUPERLU_MALLOC(sizeof(Glu_persist_t))) )
ABORT("Malloc fails for Glu_persist_t.");
if ( !(LUstruct->Llu = (LocalLU_t *)
SUPERLU_MALLOC(sizeof(LocalLU_t))) )
ABORT("Malloc fails for LocalLU_t.");
}
 
/*! \brief Deallocate LUstruct */
void LUstructFree(LUstruct_t *LUstruct)
{
#if ( DEBUGlevel>=1 )
int iam;
MPI_Comm_rank( MPI_COMM_WORLD, &iam );
CHECK_MALLOC(iam, "Enter LUstructFree()");
#endif
 
SUPERLU_FREE(LUstruct->etree);
SUPERLU_FREE(LUstruct->Glu_persist);
SUPERLU_FREE(LUstruct->Llu);
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit LUstructFree()");
#endif
}
 
/*! \brief
*
* <pre>
* Count the total number of nonzeros in factors L and U, and in the
* symmetrically reduced L.
* </pre>
*/
void
countnz_dist(const int_t n, int_t *xprune, int_t *nnzL, int_t *nnzU,
Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable)
{
int_t fnz, fsupc, i, j, nsuper;
int_t nnzL0, jlen, irep;
int_t *supno, *xsup, *xlsub, *xusub, *usub;
 
supno = Glu_persist->supno;
xsup = Glu_persist->xsup;
xlsub = Glu_freeable->xlsub;
xusub = Glu_freeable->xusub;
usub = Glu_freeable->usub;
*nnzL = 0;
*nnzU = 0;
nnzL0 = 0;
nsuper = supno[n];
 
if ( n <= 0 ) return;
 
/*
* For each supernode in L.
*/
for (i = 0; i <= nsuper; i++) {
fsupc = xsup[i];
jlen = xlsub[fsupc+1] - xlsub[fsupc];
 
for (j = fsupc; j < xsup[i+1]; j++) {
*nnzL += jlen;
*nnzU += j - fsupc + 1;
jlen--;
}
irep = xsup[i+1] - 1;
nnzL0 += xprune[irep] - xlsub[irep];
}
/* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/
/* For each column in U. */
for (j = 0; j < n; ++j) {
for (i = xusub[j]; i < xusub[j+1]; ++i) {
fnz = usub[i];
fsupc = xsup[supno[fnz]+1];
*nnzU += fsupc - fnz;
}
}
}
 
 
 
/*! \brief
*
* <pre>
* Fix up the data storage lsub for L-subscripts. It removes the subscript
* sets for structural pruning, and applies permuation to the remaining
* subscripts.
* </pre>
*/
int_t
fixupL_dist(const int_t n, const int_t *perm_r,
Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable)
{
register int_t nsuper, fsupc, nextl, i, j, k, jstrt, lsub_size;
int_t *xsup, *lsub, *xlsub;
 
if ( n <= 1 ) return 0;
 
xsup = Glu_persist->xsup;
lsub = Glu_freeable->lsub;
xlsub = Glu_freeable->xlsub;
nextl = 0;
nsuper = (Glu_persist->supno)[n];
lsub_size = xlsub[n];
/*
* For each supernode ...
*/
for (i = 0; i <= nsuper; i++) {
fsupc = xsup[i];
jstrt = xlsub[fsupc];
xlsub[fsupc] = nextl;
for (j = jstrt; j < xlsub[fsupc+1]; j++) {
lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */
nextl++;
}
for (k = fsupc+1; k < xsup[i+1]; k++)
xlsub[k] = nextl; /* Other columns in supernode i */
 
}
 
xlsub[n] = nextl;
return lsub_size;
}
 
/*! \brief Set the default values for the options argument.
*/
void set_default_options_dist(superlu_options_t *options)
{
options->Fact = DOFACT;
options->Equil = YES;
options->ParSymbFact = NO;
options->ColPerm = METIS_AT_PLUS_A;
options->RowPerm = LargeDiag;
options->ReplaceTinyPivot = YES;
options->IterRefine = DOUBLE;
options->Trans = NOTRANS;
options->SolveInitialized = NO;
options->RefineInitialized = NO;
options->PrintStat = YES;
}
 
/*! \brief Print the options setting.
*/
void print_options_dist(superlu_options_t *options)
{
printf(".. options:\n");
printf("\tFact\t %8d\n", options->Fact);
printf("\tEquil\t %8d\n", options->Equil);
printf("\tColPerm\t %8d\n", options->ColPerm);
printf("\tRowPerm\t %8d\n", options->RowPerm);
printf("\tReplaceTinyPivot %4d\n", options->ReplaceTinyPivot);
printf("\tTrans\t %8d\n", options->Trans);
printf("\tIterRefine\t%4d\n", options->IterRefine);
printf("..\n");
}
 
/*! \brief
*
* <pre>
* Purpose
* =======
* Set up the communication pattern for the triangular solution.
*
* Arguments
* =========
*
* n (input) int (global)
* The dimension of the linear system.
*
* m_loc (input) int (local)
* The local row dimension of the distributed input matrix.
*
* nrhs (input) int (global)
* Number of right-hand sides.
*
* fst_row (input) int (global)
* The row number of matrix B's first row in the global matrix.
*
* perm_r (input) int* (global)
* The row permutation vector.
*
* perm_c (input) int* (global)
* The column permutation vector.
*
* grid (input) gridinfo_t*
* The 2D process mesh.
* </pre>
*/
int_t
pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row,
int_t perm_r[], int_t perm_c[], gridinfo_t *grid,
Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct)
{
 
int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs;
int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs;
int *itemp, *ptr_to_ibuf, *ptr_to_dbuf;
int_t *row_to_proc;
int_t i, gbi, k, l, num_diag_procs, *diag_procs;
int_t irow, lk, q, knsupc, nsupers, *xsup, *supno;
int iam, p, pkk, procs;
pxgstrs_comm_t *gstrs_comm;
 
procs = grid->nprow * grid->npcol;
iam = grid->iam;
gstrs_comm = SOLVEstruct->gstrs_comm;
xsup = Glu_persist->xsup;
supno = Glu_persist->supno;
nsupers = Glu_persist->supno[n-1] + 1;
row_to_proc = SOLVEstruct->row_to_proc;
 
/* ------------------------------------------------------------
SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X.
------------------------------------------------------------*/
if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) )
ABORT("Malloc fails for B_to_X_itemp[].");
SendCnt = itemp;
SendCnt_nrhs = itemp + procs;
RecvCnt = itemp + 2*procs;
RecvCnt_nrhs = itemp + 3*procs;
sdispls = itemp + 4*procs;
sdispls_nrhs = itemp + 5*procs;
rdispls = itemp + 6*procs;
rdispls_nrhs = itemp + 7*procs;
 
/* Count the number of elements to be sent to each diagonal process.*/
for (p = 0; p < procs; ++p) SendCnt[p] = 0;
for (i = 0, l = fst_row; i < m_loc; ++i, ++l) {
irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */
gbi = BlockNum( irow );
p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
++SendCnt[p];
}
/* Set up the displacements for alltoall. */
MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
sdispls[0] = rdispls[0] = 0;
for (p = 1; p < procs; ++p) {
sdispls[p] = sdispls[p-1] + SendCnt[p-1];
rdispls[p] = rdispls[p-1] + RecvCnt[p-1];
}
for (p = 0; p < procs; ++p) {
SendCnt_nrhs[p] = SendCnt[p] * nrhs;
sdispls_nrhs[p] = sdispls[p] * nrhs;
RecvCnt_nrhs[p] = RecvCnt[p] * nrhs;
rdispls_nrhs[p] = rdispls[p] * nrhs;
}
 
/* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/
gstrs_comm->B_to_X_SendCnt = SendCnt;
 
/* ------------------------------------------------------------
SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B.
------------------------------------------------------------*/
/* This is freed in pxgstrs_finalize(). */
if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) )
ABORT("Malloc fails for X_to_B_itemp[].");
SendCnt = itemp;
SendCnt_nrhs = itemp + procs;
RecvCnt = itemp + 2*procs;
RecvCnt_nrhs = itemp + 3*procs;
sdispls = itemp + 4*procs;
sdispls_nrhs = itemp + 5*procs;
rdispls = itemp + 6*procs;
rdispls_nrhs = itemp + 7*procs;
 
/* Count the number of X entries to be sent to each process.*/
for (p = 0; p < procs; ++p) SendCnt[p] = 0;
num_diag_procs = SOLVEstruct->num_diag_procs;
diag_procs = SOLVEstruct->diag_procs;
 
for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */
pkk = diag_procs[p];
if ( iam == pkk ) {
for (k = p; k < nsupers; k += num_diag_procs) {
knsupc = SuperSize( k );
lk = LBi( k, grid ); /* Local block number */
irow = FstBlockC( k );
for (i = 0; i < knsupc; ++i) {
#if 0
q = row_to_proc[inv_perm_c[irow]];
#else
q = row_to_proc[irow];
#endif
++SendCnt[q];
++irow;
}
}
}
}
 
MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
sdispls[0] = rdispls[0] = 0;
sdispls_nrhs[0] = rdispls_nrhs[0] = 0;
SendCnt_nrhs[0] = SendCnt[0] * nrhs;
RecvCnt_nrhs[0] = RecvCnt[0] * nrhs;
for (p = 1; p < procs; ++p) {
sdispls[p] = sdispls[p-1] + SendCnt[p-1];
rdispls[p] = rdispls[p-1] + RecvCnt[p-1];
sdispls_nrhs[p] = sdispls[p] * nrhs;
rdispls_nrhs[p] = rdispls[p] * nrhs;
SendCnt_nrhs[p] = SendCnt[p] * nrhs;
RecvCnt_nrhs[p] = RecvCnt[p] * nrhs;
}
 
/* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/
gstrs_comm->X_to_B_SendCnt = SendCnt;
 
if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) )
ABORT("Malloc fails for ptr_to_ibuf[].");
gstrs_comm->ptr_to_ibuf = ptr_to_ibuf;
gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs;
 
} /* PXGSTRS_INIT */
 
 
void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm)
{
SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt);
SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt);
SUPERLU_FREE(gstrs_comm->ptr_to_ibuf);
SUPERLU_FREE(gstrs_comm);
}
 
 
/*! \brief Diagnostic print of segment info after panel_dfs().
*/
void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg,
int_t *segrep, int_t *repfnz)
{
int_t j, k;
for (j = jcol; j < jcol+w; j++) {
printf("\tcol %d:\n", j);
for (k = 0; k < nseg; k++)
printf("\t\tseg %d, segrep %d, repfnz %d\n", k,
segrep[k], repfnz[(j-jcol)*n + segrep[k]]);
}
 
}
 
void
PStatInit(SuperLUStat_t *stat)
{
register int_t i;
 
if ( !(stat->utime = SUPERLU_MALLOC(NPHASES*sizeof(double))) )
ABORT("Malloc fails for stat->utime[]");
if ( !(stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t))) )
ABORT("SUPERLU_MALLOC fails for stat->ops[]");
for (i = 0; i < NPHASES; ++i) {
stat->utime[i] = 0.;
stat->ops[i] = 0.;
}
stat->TinyPivots = stat->RefineSteps = 0;
}
 
void
PStatPrint(superlu_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid)
{
double *utime = stat->utime;
flops_t *ops = stat->ops;
int_t iam = grid->iam;
flops_t flopcnt, factflop, solveflop;
 
if ( options->PrintStat == NO ) return;
 
if ( !iam && options->Fact != FACTORED ) {
if ( options->Equil != NO )
printf("\tEQUIL time %8.2f\n", utime[EQUIL]);
if ( options->RowPerm != NOROWPERM )
printf("\tROWPERM time %8.2f\n", utime[ROWPERM]);
if ( options->ColPerm != NATURAL )
printf("\tCOLPERM time %8.2f\n", utime[COLPERM]);
printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]);
printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]);
 
}
 
MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM,
0, grid->comm);
factflop = flopcnt;
if ( !iam && options->Fact != FACTORED ) {
printf("\tFACTOR time %8.2f\n", utime[FACT]);
if ( utime[FACT] != 0.0 )
printf("\tFactor flops\t%e\tMflops \t%8.2f\n",
flopcnt,
flopcnt*1e-6/utime[FACT]);
}
MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM,
0, grid->comm);
solveflop = flopcnt;
if ( !iam ) {
printf("\tSOLVE time %8.2f\n", utime[SOLVE]);
if ( utime[SOLVE] != 0.0 )
printf("\tSolve flops\t%e\tMflops \t%8.2f\n",
flopcnt,
flopcnt*1e-6/utime[SOLVE]);
}
if ( !iam && options->IterRefine != NOREFINE ) {
printf("\tREFINEMENT time %8.2f\tSteps%8d\n\n",
utime[REFINE], stat->RefineSteps);
}
 
#if ( PROFlevel>=1 )
fflush(stdout);
MPI_Barrier( grid->comm );
 
{
int_t i, P = grid->nprow*grid->npcol;
flops_t b, maxflop;
if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n");
for (i = 0; i < P; ++i) {
if ( iam == i) {
printf("\t\t(%d)%8.2f%8.2f\n", iam, utime[COMM], utime[FACT]);
fflush(stdout);
}
MPI_Barrier( grid->comm );
}
if ( !iam ) printf("\n.. FACT ops distribution:\n");
for (i = 0; i < P; ++i) {
if ( iam == i ) {
printf("\t\t(%d)\t%e\n", iam, ops[FACT]);
fflush(stdout);
}
MPI_Barrier( grid->comm );
}
MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
if ( !iam ) {
b = factflop/P/maxflop;
printf("\tFACT load balance: %.2f\n", b);
}
if ( !iam ) printf("\n.. SOLVE ops distribution:\n");
for (i = 0; i < P; ++i) {
if ( iam == i ) {
printf("\t\t%d\t%e\n", iam, ops[SOLVE]);
fflush(stdout);
}
MPI_Barrier( grid->comm );
}
MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm);
if ( !iam ) {
b = solveflop/P/maxflop;
printf("\tSOLVE load balance: %.2f\n", b);
}
}
#endif
 
/* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */
}
 
void
PStatFree(SuperLUStat_t *stat)
{
SUPERLU_FREE(stat->utime);
SUPERLU_FREE(stat->ops);
}
 
/*! \brief Fills an integer array with a given value.
*/
void ifill_dist(int_t *a, int_t alen, int_t ival)
{
register int_t i;
for (i = 0; i < alen; i++) a[i] = ival;
}
 
 
void
get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid,
int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len)
{
int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk;
int_t *xsup;
 
i = j = *num_diag_procs = pkk = 0;
nprow = grid->nprow;
npcol = grid->npcol;
nsupers = Glu_persist->supno[n-1] + 1;
xsup = Glu_persist->xsup;
 
do {
++(*num_diag_procs);
i = (++i) % nprow;
j = (++j) % npcol;
pkk = PNUM( i, j, grid );
} while ( pkk != 0 ); /* Until wrap back to process 0 */
if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) )
ABORT("Malloc fails for diag_procs[]");
if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) )
ABORT("Calloc fails for diag_len[]");
for (i = j = k = 0; k < *num_diag_procs; ++k) {
pkk = PNUM( i, j, grid );
(*diag_procs)[k] = pkk;
i = (++i) % nprow;
j = (++j) % npcol;
}
for (k = 0; k < nsupers; ++k) {
knsupc = SuperSize( k );
i = k % *num_diag_procs;
(*diag_len)[i] += knsupc;
}
}
 
 
/*! \brief Get the statistics of the supernodes
*/
#define NBUCKS 10
static int_t max_sup_size;
 
void super_stats_dist(int_t nsuper, int_t *xsup)
{
register int_t nsup1 = 0;
int_t i, isize, whichb, bl, bh;
int_t bucket[NBUCKS];
 
max_sup_size = 0;
 
for (i = 0; i <= nsuper; i++) {
isize = xsup[i+1] - xsup[i];
if ( isize == 1 ) nsup1++;
if ( max_sup_size < isize ) max_sup_size = isize;
}
 
printf(" Supernode statistics:\n\tno of super = %d\n", nsuper+1);
printf("\tmax supernode size = %d\n", max_sup_size);
printf("\tno of size 1 supernodes = %d\n", nsup1);
 
/* Histogram of the supernode sizes */
ifill_dist (bucket, NBUCKS, 0);
 
for (i = 0; i <= nsuper; i++) {
isize = xsup[i+1] - xsup[i];
whichb = (float) isize / max_sup_size * NBUCKS;
if (whichb >= NBUCKS) whichb = NBUCKS - 1;
bucket[whichb]++;
}
printf("\tHistogram of supernode sizes:\n");
for (i = 0; i < NBUCKS; i++) {
bl = (float) i * max_sup_size / NBUCKS;
bh = (float) (i+1) * max_sup_size / NBUCKS;
printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]);
}
 
}
 
 
/*! \brief Check whether repfnz[] == EMPTY after reset.
*/
void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz)
{
int_t jj, k;
 
for (jj = jcol; jj < jcol+w; jj++)
for (k = 0; k < n; k++)
if ( repfnz[(jj-jcol)*n + k] != EMPTY ) {
fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj,
k, repfnz[(jj-jcol)*n + k]);
ABORT("check_repfnz_dist");
}
}
 
void PrintInt10(char *name, int_t len, int_t *x)
{
register int_t i;
printf("%10s:", name);
for (i = 0; i < len; ++i) {
if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i+9);
printf("%6d", x[i]);
}
printf("\n");
}
 
int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x)
{
register int_t i;
fprintf(fp, "%10s:", name);
for (i = 0; i < len; ++i) {
if ( i % 10 == 0 ) fprintf(fp, "\n\t[%2d-%2d]", i, i+9);
fprintf(fp, "%6d", x[i]);
}
fprintf(fp, "\n");
}
 
int_t
CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt)
{
register int_t i, j, zd, numzd = 0;
 
for (j = 0; j < n; ++j) {
zd = 0;
for (i = colbeg[j]; i < colbeg[j]+colcnt[j]; ++i) {
/*if ( iperm[rowind[i]] == j ) zd = 1;*/
if ( rowind[i] == j ) { zd = 1; break; }
}
if ( zd == 0 ) {
#if ( PRNTlevel>=2 )
printf(".. Diagonal of column %d is zero.\n", j);
#endif
++numzd;
}
}
 
return numzd;
}
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/mc64ad.c
New file
0,0 → 1,2641
/* mc64ad.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
 
http://www.netlib.org/f2c/libf2c.zip
*/
 
#include "superlu_ddefs.h"
 
#define abs(x) ((x) >= 0 ? (x) : -(x))
#define min(a,b) ((a) < (b)) ? (a) : (b)
 
/* Table of constant values */
 
static int_t c__1 = 1;
static int_t c__2 = 2;
 
/* CCCC COPYRIGHT (c) 1999 Council for the Central Laboratory of the */
/* CCCC Research Councils. All rights reserved. */
/* CCCC PACKAGE MC64A/AD */
/* CCCC AUTHORS Iain Duff (i.duff@rl.ac.uk) and Jacko Koster (jak@ii.uib.no) */
/* CCCC LAST UPDATE 20/09/99 */
/* CCCC */
/* *** Conditions on external use *** */
 
/* The user shall acknowledge the contribution of this */
/* package in any publication of material dependent upon the use of */
/* the package. The user shall use reasonable endeavours to notify */
/* the authors of the package of this publication. */
 
/* The user can modify this code but, at no time */
/* shall the right or title to all or any part of this package pass */
/* to the user. The user shall make available free of charge */
/* to the authors for any purpose all information relating to any */
/* alteration or addition made to this package for the purposes of */
/* extending the capabilities or enhancing the performance of this */
/* package. */
 
/* The user shall not pass this code directly to a third party without the */
/* express prior consent of the authors. Users wanting to licence their */
/* own copy of these routines should send email to hsl@aeat.co.uk */
 
/* None of the comments from the Copyright notice up to and including this */
/* one shall be removed or altered in any way. */
/* ********************************************************************** */
/* Subroutine */ int_t mc64id_(int_t *icntl)
{
int_t i__;
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* Purpose */
/* ======= */
 
/* The components of the array ICNTL control the action of MC64A/AD. */
/* Default values for these are set in this subroutine. */
 
/* Parameters */
/* ========== */
 
 
/* Local variables */
 
/* ICNTL(1) has default value 6. */
/* It is the output stream for error messages. If it */
/* is negative, these messages will be suppressed. */
 
/* ICNTL(2) has default value 6. */
/* It is the output stream for warning messages. */
/* If it is negative, these messages are suppressed. */
 
/* ICNTL(3) has default value -1. */
/* It is the output stream for monitoring printing. */
/* If it is negative, these messages are suppressed. */
 
/* ICNTL(4) has default value 0. */
/* If left at the defaut value, the incoming data is checked for */
/* out-of-range indices and duplicates. Setting ICNTL(4) to any */
/* other will avoid the checks but is likely to cause problems */
/* later if out-of-range indices or duplicates are present. */
/* The user should only set ICNTL(4) non-zero, if the data is */
/* known to avoid these problems. */
 
/* ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to */
/* zero in this routine. */
/* Initialization of the ICNTL array. */
/* Parameter adjustments */
--icntl;
 
/* Function Body */
icntl[1] = 6;
icntl[2] = 6;
icntl[3] = -1;
for (i__ = 4; i__ <= 10; ++i__) {
icntl[i__] = 0;
/* L10: */
}
return 0;
} /* mc64id_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64ad_(int_t *job, int_t *n, int_t *ne, int_t *
ip, int_t *irn, double *a, int_t *num, int_t *cperm,
int_t *liw, int_t *iw, int_t *ldw, double *dw, int_t *
icntl, int_t *info)
{
/* System generated locals */
int_t i__1, i__2;
double d__1, d__2;
 
/* Builtin functions */
double log(double);
 
/* Local variables */
int_t i__, j, k;
double fact, rinf;
 
extern /* Subroutine */ int_t mc21ad_(int_t *, int_t *, int_t *,
int_t *, int_t *, int_t *, int_t *, int_t *), mc64bd_(
int_t *, int_t *, int_t *, int_t *, double *, int_t
*, int_t *, int_t *, int_t *, int_t *, int_t *,
double *), mc64rd_(int_t *, int_t *, int_t *, int_t *,
double *), mc64sd_(int_t *, int_t *, int_t *, int_t *
, double *, int_t *, int_t *, int_t *, int_t *,
int_t *, int_t *, int_t *, int_t *, int_t *), mc64wd_(
int_t *, int_t *, int_t *, int_t *, double *, int_t
*, int_t *, int_t *, int_t *, int_t *, int_t *, int_t
*, double *, double *);
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* Purpose */
/* ======= */
 
/* This subroutine attempts to find a column permutation for an NxN */
/* sparse matrix A = {a_ij} that makes the permuted matrix have N */
/* entries on its diagonal. */
/* If the matrix is structurally nonsingular, the subroutine optionally */
/* returns a column permutation that maximizes the smallest element */
/* on the diagonal, maximizes the sum of the diagonal entries, or */
/* maximizes the product of the diagonal entries of the permuted matrix. */
/* For the latter option, the subroutine also finds scaling factors */
/* that may be used to scale the matrix so that the nonzero diagonal */
/* entries of the permuted matrix are one in absolute value and all the */
/* off-diagonal entries are less than or equal to one in absolute value. */
/* The natural logarithms of the scaling factors u(i), i=1..N, for the */
/* rows and v(j), j=1..N, for the columns are returned so that the */
/* scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j). */
 
/* Parameters */
/* ========== */
 
 
/* JOB is an INT_T variable which must be set by the user to */
/* control the action. It is not altered by the subroutine. */
/* Possible values for JOB are: */
/* 1 Compute a column permutation of the matrix so that the */
/* permuted matrix has as many entries on its diagonal as possible. */
/* The values on the diagonal are of arbitrary size. HSL subroutine */
/* MC21A/AD is used for this. See [1]. */
/* 2 Compute a column permutation of the matrix so that the smallest */
/* value on the diagonal of the permuted matrix is maximized. */
/* See [3]. */
/* 3 Compute a column permutation of the matrix so that the smallest */
/* value on the diagonal of the permuted matrix is maximized. */
/* The algorithm differs from the one used for JOB = 2 and may */
/* have quite a different performance. See [2]. */
/* 4 Compute a column permutation of the matrix so that the sum */
/* of the diagonal entries of the permuted matrix is maximized. */
/* See [3]. */
/* 5 Compute a column permutation of the matrix so that the product */
/* of the diagonal entries of the permuted matrix is maximized */
/* and vectors to scale the matrix so that the nonzero diagonal */
/* entries of the permuted matrix are one in absolute value and */
/* all the off-diagonal entries are less than or equal to one in */
/* absolute value. See [3]. */
/* Restriction: 1 <= JOB <= 5. */
 
/* N is an INT_T variable which must be set by the user to the */
/* order of the matrix A. It is not altered by the subroutine. */
/* Restriction: N >= 1. */
 
/* NE is an INT_T variable which must be set by the user to the */
/* number of entries in the matrix. It is not altered by the */
/* subroutine. */
/* Restriction: NE >= 1. */
 
/* IP is an INT_T array of length N+1. */
/* IP(J), J=1..N, must be set by the user to the position in array IRN */
/* of the first row index of an entry in column J. IP(N+1) must be set */
/* to NE+1. It is not altered by the subroutine. */
 
/* IRN is an INT_T array of length NE. */
/* IRN(K), K=1..NE, must be set by the user to hold the row indices of */
/* the entries of the matrix. Those belonging to column J must be */
/* stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering */
/* of the row indices within each column is unimportant. Repeated */
/* entries are not allowed. The array IRN is not altered by the */
/* subroutine. */
 
/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
/* The user must set A(K), K=1..NE, to the numerical value of the */
/* entry that corresponds to IRN(K). */
/* It is not used by the subroutine when JOB = 1. */
/* It is not altered by the subroutine. */
 
/* NUM is an INT_T variable that need not be set by the user. */
/* On successful exit, NUM will be the number of entries on the */
/* diagonal of the permuted matrix. */
/* If NUM < N, the matrix is structurally singular. */
 
/* CPERM is an INT_T array of length N that need not be set by the */
/* user. On successful exit, CPERM contains the column permutation. */
/* Column CPERM(J) of the original matrix is column J in the permuted */
/* matrix, J=1..N. */
 
/* LIW is an INT_T variable that must be set by the user to */
/* the dimension of array IW. It is not altered by the subroutine. */
/* Restriction: */
/* JOB = 1 : LIW >= 5N */
/* JOB = 2 : LIW >= 4N */
/* JOB = 3 : LIW >= 10N + NE */
/* JOB = 4 : LIW >= 5N */
/* JOB = 5 : LIW >= 5N */
 
/* IW is an INT_T array of length LIW that is used for workspace. */
 
/* LDW is an INT_T variable that must be set by the user to the */
/* dimension of array DW. It is not altered by the subroutine. */
/* Restriction: */
/* JOB = 1 : LDW is not used */
/* JOB = 2 : LDW >= N */
/* JOB = 3 : LDW >= NE */
/* JOB = 4 : LDW >= 2N + NE */
/* JOB = 5 : LDW >= 3N + NE */
 
/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW */
/* that is used for workspace. If JOB = 5, on return, */
/* DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N. */
 
/* ICNTL is an INT_T array of length 10. Its components control the */
/* output of MC64A/AD and must be set by the user before calling */
/* MC64A/AD. They are not altered by the subroutine. */
 
/* ICNTL(1) must be set to specify the output stream for */
/* error messages. If ICNTL(1) < 0, messages are suppressed. */
/* The default value set by MC46I/ID is 6. */
 
/* ICNTL(2) must be set by the user to specify the output stream for */
/* warning messages. If ICNTL(2) < 0, messages are suppressed. */
/* The default value set by MC46I/ID is 6. */
 
/* ICNTL(3) must be set by the user to specify the output stream for */
/* diagnostic messages. If ICNTL(3) < 0, messages are suppressed. */
/* The default value set by MC46I/ID is -1. */
 
/* ICNTL(4) must be set by the user to a value other than 0 to avoid */
/* checking of the input data. */
/* The default value set by MC46I/ID is 0. */
 
/* INFO is an INT_T array of length 10 which need not be set by the */
/* user. INFO(1) is set non-negative to indicate success. A negative */
/* value is returned if an error occurred, a positive value if a */
/* warning occurred. INFO(2) holds further information on the error. */
/* On exit from the subroutine, INFO(1) will take one of the */
/* following values: */
/* 0 : successful entry (for structurally nonsingular matrix). */
/* +1 : successful entry (for structurally singular matrix). */
/* +2 : the returned scaling factors are large and may cause */
/* overflow when used to scale the matrix. */
/* (For JOB = 5 entry only.) */
/* -1 : JOB < 1 or JOB > 5. Value of JOB held in INFO(2). */
/* -2 : N < 1. Value of N held in INFO(2). */
/* -3 : NE < 1. Value of NE held in INFO(2). */
/* -4 : the defined length LIW violates the restriction on LIW. */
/* Value of LIW required given by INFO(2). */
/* -5 : the defined length LDW violates the restriction on LDW. */
/* Value of LDW required given by INFO(2). */
/* -6 : entries are found whose row indices are out of range. INFO(2) */
/* contains the index of a column in which such an entry is found. */
/* -7 : repeated entries are found. INFO(2) contains the index of a */
/* column in which such entries are found. */
/* INFO(3) to INFO(10) are not currently used and are set to zero by */
/* the routine. */
 
/* References: */
/* [1] I. S. Duff, (1981), */
/* "Algorithm 575. Permutations for a zero-free diagonal", */
/* ACM Trans. Math. Software 7(3), 387-390. */
/* [2] I. S. Duff and J. Koster, (1998), */
/* "The design and use of algorithms for permuting large */
/* entries to the diagonal of sparse matrices", */
/* SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901. */
/* [3] I. S. Duff and J. Koster, (1999), */
/* "On algorithms for permuting large entries to the diagonal */
/* of sparse matrices", */
/* Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England. */
/* Local variables and parameters */
/* External routines and functions */
/* EXTERNAL FD05AD */
/* DOUBLE PRECISION FD05AD */
/* Intrinsic functions */
/* Set RINF to largest positive real number (infinity) */
/* XSL RINF = FD05AD(5) */
/* Parameter adjustments */
--cperm;
--ip;
--a;
--irn;
--iw;
--dw;
--icntl;
--info;
 
/* Function Body */
rinf = dlamch_("Overflow");
/* Check value of JOB */
if (*job < 1 || *job > 5) {
info[1] = -1;
info[2] = *job;
if (icntl[1] >= 0) {
printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1],
" because JOB = %d\n", *job);
}
goto L99;
}
/* Check value of N */
if (*n < 1) {
info[1] = -2;
info[2] = *n;
if (icntl[1] >= 0) {
printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1],
" because N = %d\n", *job);
}
goto L99;
}
/* Check value of NE */
if (*ne < 1) {
info[1] = -3;
info[2] = *ne;
if (icntl[1] >= 0) {
printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1],
" because NE = %d\n", *job);
}
goto L99;
}
/* Check LIW */
if (*job == 1) {
k = *n * 5;
}
if (*job == 2) {
k = *n << 2;
}
if (*job == 3) {
k = *n * 10 + *ne;
}
if (*job == 4) {
k = *n * 5;
}
if (*job == 5) {
k = *n * 5;
}
if (*liw < k) {
info[1] = -4;
info[2] = k;
if (icntl[1] >= 0) {
printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1],
" LIW too small, must be at least %8d\n", k);
}
goto L99;
}
/* Check LDW */
/* If JOB = 1, do not check */
if (*job > 1) {
if (*job == 2) {
k = *n;
}
if (*job == 3) {
k = *ne;
}
if (*job == 4) {
k = (*n << 1) + *ne;
}
if (*job == 5) {
k = *n * 3 + *ne;
}
if (*ldw < k) {
info[1] = -5;
info[2] = k;
if (icntl[1] >= 0) {
printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1],
" LDW too small, must be at least %8d\n", k);
}
goto L99;
}
}
if (icntl[4] == 0) {
/* Check row indices. Use IW(1:N) as workspace */
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
iw[i__] = 0;
/* L3: */
}
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
i__ = irn[k];
/* Check for row indices that are out of range */
if (i__ < 1 || i__ > *n) {
info[1] = -6;
info[2] = j;
if (icntl[1] >= 0) {
printf(" ****** Error in MC64A/AD. INFO(1) = %2d",
info[1], " Column %8d", j,
" contains an entry with invalid row index %8d\n", i__);
}
goto L99;
}
/* Check for repeated row indices within a column */
if (iw[i__] == j) {
info[1] = -7;
info[2] = j;
if (icntl[1] >= 0) {
printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1],
" Column %8d", j,
" contains two or more entries with row index %8d\n", i__);
}
goto L99;
} else {
iw[i__] = j;
}
/* L4: */
}
/* L6: */
}
}
/* Print diagnostics on input */
if (icntl[3] >= 0) {
printf(" ****** Input parameters for MC64A/AD: JOB = %8d,"
" N = %d, NE = %8d\n", *job, *n, *ne);
printf(" IP(1:N+1) = ");
for (j=1; j<=(*n+1); ++j) {
printf("%8d", ip[j]);
if (j%8 == 0) printf("\n");
}
printf("\n IRN(1:NE) = ");
for (j=1; j<=(*ne); ++j) {
printf("%8d", irn[j]);
if (j%8 == 0) printf("\n");
}
printf("\n");
 
if (*job > 1) {
printf(" A(1:NE) = ");
for (j=1; j<=(*ne); ++j) {
printf("%f14.4", a[j]);
if (j%4 == 0) printf("\n");
}
printf("\n");
}
}
/* Set components of INFO to zero */
for (i__ = 1; i__ <= 10; ++i__) {
info[i__] = 0;
/* L8: */
}
/* Compute maximum matching with MC21A/AD */
if (*job == 1) {
/* Put length of column J in IW(J) */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
iw[j] = ip[j + 1] - ip[j];
/* L10: */
}
/* IW(N+1:5N) is workspace */
#if 0
mc21ad_(n, &irn[1], ne, &ip[1], &iw[1], &cperm[1], num, &iw[*n+1]);
#else
printf(" ****** Warning from MC64A/AD. Need to link mc21ad.\n");
#endif
goto L90;
}
/* Compute bottleneck matching */
if (*job == 2) {
/* IW(1:5N), DW(1:N) are workspaces */
mc64bd_(n, ne, &ip[1], &irn[1], &a[1], &cperm[1], num, &iw[1], &iw[*n
+ 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &dw[1]);
goto L90;
}
/* Compute bottleneck matching */
if (*job == 3) {
/* Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE */
i__1 = *ne;
for (k = 1; k <= i__1; ++k) {
iw[k] = irn[k];
dw[k] = (d__1 = a[k], abs(d__1));
/* L20: */
}
/* Sort entries in each column by decreasing value. */
mc64rd_(n, ne, &ip[1], &iw[1], &dw[1]);
/* IW(NE+1:NE+10N) is workspace */
mc64sd_(n, ne, &ip[1], &iw[1], &dw[1], &cperm[1], num, &iw[*ne + 1], &
iw[*ne + *n + 1], &iw[*ne + (*n << 1) + 1], &iw[*ne + *n * 3
+ 1], &iw[*ne + (*n << 2) + 1], &iw[*ne + *n * 5 + 1], &iw[*
ne + *n * 6 + 1]);
goto L90;
}
if (*job == 4) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
fact = 0.;
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
if ((d__1 = a[k], abs(d__1)) > fact) {
fact = (d__2 = a[k], abs(d__2));
}
/* L30: */
}
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
dw[(*n << 1) + k] = fact - (d__1 = a[k], abs(d__1));
/* L40: */
}
/* L50: */
}
/* B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces */
mc64wd_(n, ne, &ip[1], &irn[1], &dw[(*n << 1) + 1], &cperm[1], num, &
iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &iw[(
*n << 2) + 1], &dw[1], &dw[*n + 1]);
goto L90;
}
if (*job == 5) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
fact = 0.;
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
dw[*n * 3 + k] = (d__1 = a[k], abs(d__1));
if (dw[*n * 3 + k] > fact) {
fact = dw[*n * 3 + k];
}
/* L60: */
}
dw[(*n << 1) + j] = fact;
if (fact != 0.) {
fact = log(fact);
} else {
fact = rinf / *n;
}
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
if (dw[*n * 3 + k] != 0.) {
dw[*n * 3 + k] = fact - log(dw[*n * 3 + k]);
} else {
dw[*n * 3 + k] = rinf / *n;
}
/* L70: */
}
/* L75: */
}
/* B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces */
mc64wd_(n, ne, &ip[1], &irn[1], &dw[*n * 3 + 1], &cperm[1], num, &iw[
1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &iw[(*n
<< 2) + 1], &dw[1], &dw[*n + 1]);
if (*num == *n) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (dw[(*n << 1) + j] != 0.) {
dw[*n + j] -= log(dw[(*n << 1) + j]);
} else {
dw[*n + j] = 0.;
}
/* L80: */
}
}
/* Check size of scaling factors */
fact = log(rinf) * .5f;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (dw[j] < fact && dw[*n + j] < fact) {
goto L86;
}
info[1] = 2;
goto L90;
L86:
;
}
/* GO TO 90 */
}
L90:
if (info[1] == 0 && *num < *n) {
/* Matrix is structurally singular, return with warning */
info[1] = 1;
if (icntl[2] >= 0) {
printf(" ****** Warning from MC64A/AD. INFO(1) = %2d", info[1],
" The matrix is structurally singular.\n");
}
}
if (info[1] == 2) {
/* Scaling factors are large, return with warning */
if (icntl[2] >= 0) {
printf(" ****** Warning from MC64A/AD. INFO(1) = %2d\n", info[1],
" Some scaling factors may be too large.\n");
}
}
/* Print diagnostics on output */
if (icntl[3] >= 0) {
printf(" ****** Output parameters for MC64A/AD: INFO(1:2) = %8d%8d\n",
info[1], info[2]);
printf(" NUM = ", *num);
printf(" CPERM(1:N) = ");
for (j=1; j<=*n; ++j) {
printf("%8d", cperm[j]);
if (j%8 == 0) printf("\n");
}
if (*job == 5) {
printf("\n DW(1:N) = ");
for (j=1; j<=*n; ++j) {
printf("%11.3f", dw[j]);
if (j%5 == 0) printf("\n");
}
printf("\n DW(N+1:2N) = ");
for (j=1; j<=*n; ++j) {
printf("%11.3f", dw[*n+j]);
if (j%5 == 0) printf("\n");
}
printf("\n");
}
}
/* Return from subroutine. */
L99:
return 0;
} /* mc64ad_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64bd_(int_t *n, int_t *ne, int_t *ip, int_t *
irn, double *a, int_t *iperm, int_t *num, int_t *jperm,
int_t *pr, int_t *q, int_t *l, double *d__)
{
/* System generated locals */
int_t i__1, i__2, i__3;
double d__1, d__2, d__3;
 
/* Local variables */
int_t i__, j, k;
double a0;
int_t i0, q0;
double ai, di;
int_t ii, jj, kk;
double bv;
int_t up;
double dq0;
int_t kk1, kk2;
double csp;
int_t isp, jsp, low;
double dnew;
int_t jord, qlen, idum, jdum;
double rinf;
extern /* Subroutine */ int_t mc64dd_(int_t *, int_t *, int_t *,
double *, int_t *, int_t *), mc64ed_(int_t *, int_t *,
int_t *, double *, int_t *, int_t *), mc64fd_(int_t *
, int_t *, int_t *, int_t *, double *, int_t *, int_t *);
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* N, NE, IP, IRN are described in MC64A/AD. */
/* A is a REAL (DOUBLE PRECISION in the D-version) array of length */
/* NE. A(K), K=1..NE, must be set to the value of the entry */
/* that corresponds to IRN(K). It is not altered. */
/* IPERM is an INT_T array of length N. On exit, it contains the */
/* matching: IPERM(I) = 0 or row I is matched to column IPERM(I). */
/* NUM is INT_T variable. On exit, it contains the cardinality of the */
/* matching stored in IPERM. */
/* IW is an INT_T work array of length 4N. */
/* DW is a REAL (DOUBLE PRECISION in D-version) work array of length N. */
/* Local variables */
/* Local parameters */
/* Intrinsic functions */
/* External subroutines and/or functions */
/* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DLAMCH */
/* DOUBLE PRECISION FD05AD, DLAMCH */
/* Set RINF to largest positive real number */
/* XSL RINF = FD05AD(5) */
/* Parameter adjustments */
--d__;
--l;
--q;
--pr;
--jperm;
--iperm;
--ip;
--a;
--irn;
 
/* Function Body */
rinf = dlamch_("Overflow");
/* Initialization */
*num = 0;
bv = rinf;
i__1 = *n;
for (k = 1; k <= i__1; ++k) {
iperm[k] = 0;
jperm[k] = 0;
pr[k] = ip[k];
d__[k] = 0.;
/* L10: */
}
/* Scan columns of matrix; */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
a0 = -1.;
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
i__ = irn[k];
ai = (d__1 = a[k], abs(d__1));
if (ai > d__[i__]) {
d__[i__] = ai;
}
if (jperm[j] != 0) {
goto L30;
}
if (ai >= bv) {
a0 = bv;
if (iperm[i__] != 0) {
goto L30;
}
jperm[j] = i__;
iperm[i__] = j;
++(*num);
} else {
if (ai <= a0) {
goto L30;
}
a0 = ai;
i0 = i__;
}
L30:
;
}
if (a0 != -1. && a0 < bv) {
bv = a0;
if (iperm[i0] != 0) {
goto L20;
}
iperm[i0] = j;
jperm[j] = i0;
++(*num);
}
L20:
;
}
/* Update BV with smallest of all the largest maximum absolute values */
/* of the rows. */
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
/* Computing MIN */
d__1 = bv, d__2 = d__[i__];
bv = min(d__1,d__2);
/* L25: */
}
if (*num == *n) {
goto L1000;
}
/* Rescan unassigned columns; improve initial assignment */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (jperm[j] != 0) {
goto L95;
}
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
i__ = irn[k];
ai = (d__1 = a[k], abs(d__1));
if (ai < bv) {
goto L50;
}
if (iperm[i__] == 0) {
goto L90;
}
jj = iperm[i__];
kk1 = pr[jj];
kk2 = ip[jj + 1] - 1;
if (kk1 > kk2) {
goto L50;
}
i__3 = kk2;
for (kk = kk1; kk <= i__3; ++kk) {
ii = irn[kk];
if (iperm[ii] != 0) {
goto L70;
}
if ((d__1 = a[kk], abs(d__1)) >= bv) {
goto L80;
}
L70:
;
}
pr[jj] = kk2 + 1;
L50:
;
}
goto L95;
L80:
jperm[jj] = ii;
iperm[ii] = jj;
pr[jj] = kk + 1;
L90:
++(*num);
jperm[j] = i__;
iperm[i__] = j;
pr[j] = k + 1;
L95:
;
}
if (*num == *n) {
goto L1000;
}
/* Prepare for main loop */
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
d__[i__] = -1.;
l[i__] = 0;
/* L99: */
}
/* Main loop ... each pass round this loop is similar to Dijkstra's */
/* algorithm for solving the single source shortest path problem */
i__1 = *n;
for (jord = 1; jord <= i__1; ++jord) {
if (jperm[jord] != 0) {
goto L100;
}
qlen = 0;
low = *n + 1;
up = *n + 1;
/* CSP is cost of shortest path to any unassigned row */
/* ISP is matrix position of unassigned row element in shortest path */
/* JSP is column index of unassigned row element in shortest path */
csp = -1.;
/* Build shortest path tree starting from unassigned column JORD */
j = jord;
pr[j] = -1;
/* Scan column J */
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
i__ = irn[k];
dnew = (d__1 = a[k], abs(d__1));
if (csp >= dnew) {
goto L115;
}
if (iperm[i__] == 0) {
/* Row I is unassigned; update shortest path info */
csp = dnew;
isp = i__;
jsp = j;
if (csp >= bv) {
goto L160;
}
} else {
d__[i__] = dnew;
if (dnew >= bv) {
/* Add row I to Q2 */
--low;
q[low] = i__;
} else {
/* Add row I to Q, and push it */
++qlen;
l[i__] = qlen;
mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__1);
}
jj = iperm[i__];
pr[jj] = j;
}
L115:
;
}
i__2 = *num;
for (jdum = 1; jdum <= i__2; ++jdum) {
/* If Q2 is empty, extract new rows from Q */
if (low == up) {
if (qlen == 0) {
goto L160;
}
i__ = q[1];
if (csp >= d__[i__]) {
goto L160;
}
bv = d__[i__];
i__3 = *n;
for (idum = 1; idum <= i__3; ++idum) {
mc64ed_(&qlen, n, &q[1], &d__[1], &l[1], &c__1);
l[i__] = 0;
--low;
q[low] = i__;
if (qlen == 0) {
goto L153;
}
i__ = q[1];
if (d__[i__] != bv) {
goto L153;
}
/* L152: */
}
/* End of dummy loop; this point is never reached */
}
/* Move row Q0 */
L153:
--up;
q0 = q[up];
dq0 = d__[q0];
l[q0] = up;
/* Scan column that matches with row Q0 */
j = iperm[q0];
i__3 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__3; ++k) {
i__ = irn[k];
/* Update D(I) */
if (l[i__] >= up) {
goto L155;
}
/* Computing MIN */
d__2 = dq0, d__3 = (d__1 = a[k], abs(d__1));
dnew = min(d__2,d__3);
if (csp >= dnew) {
goto L155;
}
if (iperm[i__] == 0) {
/* Row I is unassigned; update shortest path info */
csp = dnew;
isp = i__;
jsp = j;
if (csp >= bv) {
goto L160;
}
} else {
di = d__[i__];
if (di >= bv || di >= dnew) {
goto L155;
}
d__[i__] = dnew;
if (dnew >= bv) {
/* Delete row I from Q (if necessary); add row I to Q2 */
if (di != -1.) {
mc64fd_(&l[i__], &qlen, n, &q[1], &d__[1], &l[1],
&c__1);
}
l[i__] = 0;
--low;
q[low] = i__;
} else {
/* Add row I to Q (if necessary); push row I up Q */
if (di == -1.) {
++qlen;
l[i__] = qlen;
}
mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__1);
}
/* Update tree */
jj = iperm[i__];
pr[jj] = j;
}
L155:
;
}
/* L150: */
}
/* If CSP = MINONE, no augmenting path is found */
L160:
if (csp == -1.) {
goto L190;
}
/* Update bottleneck value */
bv = min(bv,csp);
/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */
++(*num);
i__ = isp;
j = jsp;
i__2 = *num + 1;
for (jdum = 1; jdum <= i__2; ++jdum) {
i0 = jperm[j];
jperm[j] = i__;
iperm[i__] = j;
j = pr[j];
if (j == -1) {
goto L190;
}
i__ = i0;
/* L170: */
}
/* End of dummy loop; this point is never reached */
L190:
i__2 = *n;
for (kk = up; kk <= i__2; ++kk) {
i__ = q[kk];
d__[i__] = -1.;
l[i__] = 0;
/* L191: */
}
i__2 = up - 1;
for (kk = low; kk <= i__2; ++kk) {
i__ = q[kk];
d__[i__] = -1.;
/* L192: */
}
i__2 = qlen;
for (kk = 1; kk <= i__2; ++kk) {
i__ = q[kk];
d__[i__] = -1.;
l[i__] = 0;
/* L193: */
}
L100:
;
}
/* End of main loop */
/* BV is bottleneck value of final matching */
if (*num == *n) {
goto L1000;
}
/* Matrix is structurally singular, complete IPERM. */
/* JPERM, PR are work arrays */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
jperm[j] = 0;
/* L300: */
}
k = 0;
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
if (iperm[i__] == 0) {
++k;
pr[k] = i__;
} else {
j = iperm[i__];
jperm[j] = i__;
}
/* L310: */
}
k = 0;
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
if (jperm[i__] != 0) {
goto L320;
}
++k;
jdum = pr[k];
iperm[jdum] = i__;
L320:
;
}
L1000:
return 0;
} /* mc64bd_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64dd_(int_t *i__, int_t *n, int_t *q, double
*d__, int_t *l, int_t *iway)
{
/* System generated locals */
int_t i__1;
 
/* Local variables */
double di;
int_t qk, pos, idum, posk;
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* Variables N,Q,D,L are described in MC64B/BD */
/* IF IWAY is equal to 1, then */
/* node I is pushed from its current position upwards */
/* IF IWAY is not equal to 1, then */
/* node I is pushed from its current position downwards */
/* Local variables and parameters */
/* Parameter adjustments */
--l;
--d__;
--q;
 
/* Function Body */
di = d__[*i__];
pos = l[*i__];
/* POS is index of current position of I in the tree */
if (*iway == 1) {
i__1 = *n;
for (idum = 1; idum <= i__1; ++idum) {
if (pos <= 1) {
goto L20;
}
posk = pos / 2;
qk = q[posk];
if (di <= d__[qk]) {
goto L20;
}
q[pos] = qk;
l[qk] = pos;
pos = posk;
/* L10: */
}
/* End of dummy loop; this point is never reached */
} else {
i__1 = *n;
for (idum = 1; idum <= i__1; ++idum) {
if (pos <= 1) {
goto L20;
}
posk = pos / 2;
qk = q[posk];
if (di >= d__[qk]) {
goto L20;
}
q[pos] = qk;
l[qk] = pos;
pos = posk;
/* L15: */
}
/* End of dummy loop; this point is never reached */
}
/* End of dummy if; this point is never reached */
L20:
q[pos] = *i__;
l[*i__] = pos;
return 0;
} /* mc64dd_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64ed_(int_t *qlen, int_t *n, int_t *q,
double *d__, int_t *l, int_t *iway)
{
/* System generated locals */
int_t i__1;
 
/* Local variables */
int_t i__;
double di, dk, dr;
int_t pos, idum, posk;
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */
/* MC64W/WD (IWAY = 2) */
/* The root node is deleted from the binary heap. */
/* Local variables and parameters */
/* Move last element to begin of Q */
/* Parameter adjustments */
--l;
--d__;
--q;
 
/* Function Body */
i__ = q[*qlen];
di = d__[i__];
--(*qlen);
pos = 1;
if (*iway == 1) {
i__1 = *n;
for (idum = 1; idum <= i__1; ++idum) {
posk = pos << 1;
if (posk > *qlen) {
goto L20;
}
dk = d__[q[posk]];
if (posk < *qlen) {
dr = d__[q[posk + 1]];
if (dk < dr) {
++posk;
dk = dr;
}
}
if (di >= dk) {
goto L20;
}
/* Exchange old last element with larger priority child */
q[pos] = q[posk];
l[q[pos]] = pos;
pos = posk;
/* L10: */
}
/* End of dummy loop; this point is never reached */
} else {
i__1 = *n;
for (idum = 1; idum <= i__1; ++idum) {
posk = pos << 1;
if (posk > *qlen) {
goto L20;
}
dk = d__[q[posk]];
if (posk < *qlen) {
dr = d__[q[posk + 1]];
if (dk > dr) {
++posk;
dk = dr;
}
}
if (di <= dk) {
goto L20;
}
/* Exchange old last element with smaller child */
q[pos] = q[posk];
l[q[pos]] = pos;
pos = posk;
/* L15: */
}
/* End of dummy loop; this point is never reached */
}
/* End of dummy if; this point is never reached */
L20:
q[pos] = i__;
l[i__] = pos;
return 0;
} /* mc64ed_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64fd_(int_t *pos0, int_t *qlen, int_t *n,
int_t *q, double *d__, int_t *l, int_t *iway)
{
/* System generated locals */
int_t i__1;
 
/* Local variables */
int_t i__;
double di, dk, dr;
int_t qk, pos, idum, posk;
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */
/* MC64WD (IWAY = 2). */
/* Move last element in the heap */
/* Quick return, if possible */
/* Parameter adjustments */
--l;
--d__;
--q;
 
/* Function Body */
if (*qlen == *pos0) {
--(*qlen);
return 0;
}
/* Move last element from queue Q to position POS0 */
/* POS is current position of node I in the tree */
i__ = q[*qlen];
di = d__[i__];
--(*qlen);
pos = *pos0;
if (*iway == 1) {
i__1 = *n;
for (idum = 1; idum <= i__1; ++idum) {
if (pos <= 1) {
goto L20;
}
posk = pos / 2;
qk = q[posk];
if (di <= d__[qk]) {
goto L20;
}
q[pos] = qk;
l[qk] = pos;
pos = posk;
/* L10: */
}
/* End of dummy loop; this point is never reached */
L20:
q[pos] = i__;
l[i__] = pos;
i__1 = *n;
for (idum = 1; idum <= i__1; ++idum) {
posk = pos << 1;
if (posk > *qlen) {
goto L40;
}
dk = d__[q[posk]];
if (posk < *qlen) {
dr = d__[q[posk + 1]];
if (dk < dr) {
++posk;
dk = dr;
}
}
if (di >= dk) {
goto L40;
}
qk = q[posk];
q[pos] = qk;
l[qk] = pos;
pos = posk;
/* L30: */
}
/* End of dummy loop; this point is never reached */
} else {
i__1 = *n;
for (idum = 1; idum <= i__1; ++idum) {
if (pos <= 1) {
goto L34;
}
posk = pos / 2;
qk = q[posk];
if (di >= d__[qk]) {
goto L34;
}
q[pos] = qk;
l[qk] = pos;
pos = posk;
/* L32: */
}
/* End of dummy loop; this point is never reached */
L34:
q[pos] = i__;
l[i__] = pos;
i__1 = *n;
for (idum = 1; idum <= i__1; ++idum) {
posk = pos << 1;
if (posk > *qlen) {
goto L40;
}
dk = d__[q[posk]];
if (posk < *qlen) {
dr = d__[q[posk + 1]];
if (dk > dr) {
++posk;
dk = dr;
}
}
if (di <= dk) {
goto L40;
}
qk = q[posk];
q[pos] = qk;
l[qk] = pos;
pos = posk;
/* L36: */
}
/* End of dummy loop; this point is never reached */
}
/* End of dummy if; this point is never reached */
L40:
q[pos] = i__;
l[i__] = pos;
return 0;
} /* mc64fd_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64rd_(int_t *n, int_t *ne, int_t *ip, int_t *
irn, double *a)
{
/* System generated locals */
int_t i__1, i__2, i__3;
 
/* Local variables */
int_t j, k, r__, s;
double ha;
int_t hi, td, mid, len, ipj;
double key;
int_t last, todo[50], first;
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* This subroutine sorts the entries in each column of the */
/* sparse matrix (defined by N,NE,IP,IRN,A) by decreasing */
/* numerical value. */
/* Local constants */
/* Local variables */
/* Local arrays */
/* Parameter adjustments */
--ip;
--a;
--irn;
 
/* Function Body */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
len = ip[j + 1] - ip[j];
if (len <= 1) {
goto L100;
}
ipj = ip[j];
/* Sort array roughly with partial quicksort */
if (len < 15) {
goto L400;
}
todo[0] = ipj;
todo[1] = ipj + len;
td = 2;
L500:
first = todo[td - 2];
last = todo[td - 1];
/* KEY is the smallest of two values present in interval [FIRST,LAST) */
key = a[(first + last) / 2];
i__2 = last - 1;
for (k = first; k <= i__2; ++k) {
ha = a[k];
if (ha == key) {
goto L475;
}
if (ha > key) {
goto L470;
}
key = ha;
goto L470;
L475:
;
}
/* Only one value found in interval, so it is already sorted */
td += -2;
goto L425;
/* Reorder interval [FIRST,LAST) such that entries before MID are gt KEY */
L470:
mid = first;
i__2 = last - 1;
for (k = first; k <= i__2; ++k) {
if (a[k] <= key) {
goto L450;
}
ha = a[mid];
a[mid] = a[k];
a[k] = ha;
hi = irn[mid];
irn[mid] = irn[k];
irn[k] = hi;
++mid;
L450:
;
}
/* Both subintervals [FIRST,MID), [MID,LAST) are nonempty */
/* Stack the longest of the two subintervals first */
if (mid - first >= last - mid) {
todo[td + 1] = last;
todo[td] = mid;
todo[td - 1] = mid;
/* TODO(TD-1) = FIRST */
} else {
todo[td + 1] = mid;
todo[td] = first;
todo[td - 1] = last;
todo[td - 2] = mid;
}
td += 2;
L425:
if (td == 0) {
goto L400;
}
/* There is still work to be done */
if (todo[td - 1] - todo[td - 2] >= 15) {
goto L500;
}
/* Next interval is already short enough for straightforward insertion */
td += -2;
goto L425;
/* Complete sorting with straightforward insertion */
L400:
i__2 = ipj + len - 1;
for (r__ = ipj + 1; r__ <= i__2; ++r__) {
if (a[r__ - 1] < a[r__]) {
ha = a[r__];
hi = irn[r__];
a[r__] = a[r__ - 1];
irn[r__] = irn[r__ - 1];
i__3 = ipj + 1;
for (s = r__ - 1; s >= i__3; --s) {
if (a[s - 1] < ha) {
a[s] = a[s - 1];
irn[s] = irn[s - 1];
} else {
a[s] = ha;
irn[s] = hi;
goto L200;
}
/* L300: */
}
a[ipj] = ha;
irn[ipj] = hi;
}
L200:
;
}
L100:
;
}
return 0;
} /* mc64rd_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64sd_(int_t *n, int_t *ne, int_t *ip, int_t *
irn, double *a, int_t *iperm, int_t *numx, int_t *w,
int_t *len, int_t *lenl, int_t *lenh, int_t *fc, int_t *iw,
int_t *iw4)
{
/* System generated locals */
int_t i__1, i__2, i__3, i__4;
 
/* Local variables */
int_t i__, j, k, l, ii, mod, cnt, num;
double bval, bmin, bmax, rinf;
int_t nval, wlen, idum1, idum2, idum3;
extern /* Subroutine */ int_t mc64qd_(int_t *, int_t *, int_t *,
int_t *, int_t *, double *, int_t *, double *),
mc64ud_(int_t *, int_t *, int_t *, int_t *, int_t *,
int_t *, int_t *, int_t *, int_t *, int_t *, int_t *,
int_t *, int_t *, int_t *, int_t *);
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* N, NE, IP, IRN, are described in MC64A/AD. */
/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
/* A(K), K=1..NE, must be set to the value of the entry that */
/* corresponds to IRN(k). The entries in each column must be */
/* non-negative and ordered by decreasing value. */
/* IPERM is an INT_T array of length N. On exit, it contains the */
/* bottleneck matching: IPERM(I) - 0 or row I is matched to column */
/* IPERM(I). */
/* NUMX is an INT_T variable. On exit, it contains the cardinality */
/* of the matching stored in IPERM. */
/* IW is an INT_T work array of length 10N. */
/* FC is an int_t array of length N that contains the list of */
/* unmatched columns. */
/* LEN(J), LENL(J), LENH(J) are int_t arrays of length N that point */
/* to entries in matrix column J. */
/* In the matrix defined by the column parts IP(J)+LENL(J) we know */
/* a matching does not exist; in the matrix defined by the column */
/* parts IP(J)+LENH(J) we know one exists. */
/* LEN(J) lies between LENL(J) and LENH(J) and determines the matrix */
/* that is tested for a maximum matching. */
/* W is an int_t array of length N and contains the indices of the */
/* columns for which LENL ne LENH. */
/* WLEN is number of indices stored in array W. */
/* IW is int_t work array of length N. */
/* IW4 is int_t work array of length 4N used by MC64U/UD. */
/* EXTERNAL FD05AD,MC64QD,MC64UD */
/* DOUBLE PRECISION FD05AD */
/* BMIN and BMAX are such that a maximum matching exists for the input */
/* matrix in which all entries smaller than BMIN are dropped. */
/* For BMAX, a maximum matching does not exist. */
/* BVAL is a value between BMIN and BMAX. */
/* CNT is the number of calls made to MC64U/UD so far. */
/* NUM is the cardinality of last matching found. */
/* Set RINF to largest positive real number */
/* XSL RINF = FD05AD(5) */
/* Parameter adjustments */
--iw4;
--iw;
--fc;
--lenh;
--lenl;
--len;
--w;
--iperm;
--ip;
--a;
--irn;
 
/* Function Body */
rinf = dlamch_("Overflow");
/* Compute a first maximum matching from scratch on whole matrix. */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
fc[j] = j;
iw[j] = 0;
len[j] = ip[j + 1] - ip[j];
/* L20: */
}
/* The first call to MC64U/UD */
cnt = 1;
mod = 1;
*numx = 0;
mc64ud_(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1], numx,
n, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1], &iw4[*n * 3 + 1]);
/* IW contains a maximum matching of length NUMX. */
num = *numx;
if (num != *n) {
/* Matrix is structurally singular */
bmax = rinf;
} else {
/* Matrix is structurally nonsingular, NUM=NUMX=N; */
/* Set BMAX just above the smallest of all the maximum absolute */
/* values of the columns */
bmax = rinf;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
bval = 0.f;
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
if (a[k] > bval) {
bval = a[k];
}
/* L25: */
}
if (bval < bmax) {
bmax = bval;
}
/* L30: */
}
bmax *= 1.001f;
}
/* Initialize BVAL,BMIN */
bval = 0.f;
bmin = 0.f;
/* Initialize LENL,LEN,LENH,W,WLEN according to BMAX. */
/* Set LEN(J), LENH(J) just after last entry in column J. */
/* Set LENL(J) just after last entry in column J with value ge BMAX. */
wlen = 0;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
l = ip[j + 1] - ip[j];
lenh[j] = l;
len[j] = l;
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
if (a[k] < bmax) {
goto L46;
}
/* L45: */
}
/* Column J is empty or all entries are ge BMAX */
k = ip[j + 1];
L46:
lenl[j] = k - ip[j];
/* Add J to W if LENL(J) ne LENH(J) */
if (lenl[j] == l) {
goto L48;
}
++wlen;
w[wlen] = j;
L48:
;
}
/* Main loop */
i__1 = *ne;
for (idum1 = 1; idum1 <= i__1; ++idum1) {
if (num == *numx) {
/* We have a maximum matching in IW; store IW in IPERM */
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
iperm[i__] = iw[i__];
/* L50: */
}
/* Keep going round this loop until matching IW is no longer maximum. */
i__2 = *ne;
for (idum2 = 1; idum2 <= i__2; ++idum2) {
bmin = bval;
if (bmax == bmin) {
goto L99;
}
/* Find splitting value BVAL */
mc64qd_(&ip[1], &lenl[1], &len[1], &w[1], &wlen, &a[1], &nval,
&bval);
if (nval <= 1) {
goto L99;
}
/* Set LEN such that all matrix entries with value lt BVAL are */
/* discarded. Store old LEN in LENH. Do this for all columns W(K). */
/* Each step, either K is incremented or WLEN is decremented. */
k = 1;
i__3 = *n;
for (idum3 = 1; idum3 <= i__3; ++idum3) {
if (k > wlen) {
goto L71;
}
j = w[k];
i__4 = ip[j] + lenl[j];
for (ii = ip[j] + len[j] - 1; ii >= i__4; --ii) {
if (a[ii] >= bval) {
goto L60;
}
i__ = irn[ii];
if (iw[i__] != j) {
goto L55;
}
/* Remove entry from matching */
iw[i__] = 0;
--num;
fc[*n - num] = j;
L55:
;
}
L60:
lenh[j] = len[j];
/* IP(J)+LEN(J)-1 is last entry in column ge BVAL */
len[j] = ii - ip[j] + 1;
/* If LENH(J) = LENL(J), remove J from W */
if (lenl[j] == lenh[j]) {
w[k] = w[wlen];
--wlen;
} else {
++k;
}
/* L70: */
}
L71:
if (num < *numx) {
goto L81;
}
/* L80: */
}
/* End of dummy loop; this point is never reached */
/* Set mode for next call to MC64U/UD */
L81:
mod = 1;
} else {
/* We do not have a maximum matching in IW. */
bmax = bval;
/* BMIN is the bottleneck value of a maximum matching; */
/* for BMAX the matching is not maximum, so BMAX>BMIN */
/* IF (BMAX .EQ. BMIN) GO TO 99 */
/* Find splitting value BVAL */
mc64qd_(&ip[1], &len[1], &lenh[1], &w[1], &wlen, &a[1], &nval, &
bval);
if (nval == 0 || bval == bmin) {
goto L99;
}
/* Set LEN such that all matrix entries with value ge BVAL are */
/* inside matrix. Store old LEN in LENL. Do this for all columns W(K). */
/* Each step, either K is incremented or WLEN is decremented. */
k = 1;
i__2 = *n;
for (idum3 = 1; idum3 <= i__2; ++idum3) {
if (k > wlen) {
goto L88;
}
j = w[k];
i__3 = ip[j] + lenh[j] - 1;
for (ii = ip[j] + len[j]; ii <= i__3; ++ii) {
if (a[ii] < bval) {
goto L86;
}
/* L85: */
}
L86:
lenl[j] = len[j];
len[j] = ii - ip[j];
if (lenl[j] == lenh[j]) {
w[k] = w[wlen];
--wlen;
} else {
++k;
}
/* L87: */
}
/* End of dummy loop; this point is never reached */
/* Set mode for next call to MC64U/UD */
L88:
mod = 0;
}
++cnt;
mc64ud_(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1], &
num, numx, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1], &iw4[*
n * 3 + 1]);
/* IW contains maximum matching of length NUM */
/* L90: */
}
/* End of dummy loop; this point is never reached */
/* BMIN is bottleneck value of final matching */
L99:
if (*numx == *n) {
goto L1000;
}
/* The matrix is structurally singular, complete IPERM */
/* W, IW are work arrays */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
w[j] = 0;
/* L300: */
}
k = 0;
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
if (iperm[i__] == 0) {
++k;
iw[k] = i__;
} else {
j = iperm[i__];
w[j] = i__;
}
/* L310: */
}
k = 0;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (w[j] != 0) {
goto L320;
}
++k;
idum1 = iw[k];
iperm[idum1] = j;
L320:
;
}
L1000:
return 0;
} /* mc64sd_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64qd_(int_t *ip, int_t *lenl, int_t *lenh,
int_t *w, int_t *wlen, double *a, int_t *nval, double *
val)
{
/* System generated locals */
int_t i__1, i__2, i__3;
 
/* Local variables */
int_t j, k, s;
double ha;
int_t ii, pos;
double split[10];
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* This routine searches for at most XX different numerical values */
/* in the columns W(1:WLEN). XX>=2. */
/* Each column J is scanned between IP(J)+LENL(J) and IP(J)+LENH(J)-1 */
/* until XX values are found or all columns have been considered. */
/* On output, NVAL is the number of different values that is found */
/* and SPLIT(1:NVAL) contains the values in decreasing order. */
/* If NVAL > 0, the routine returns VAL = SPLIT((NVAL+1)/2). */
 
/* Scan columns in W(1:WLEN). For each encountered value, if value not */
/* already present in SPLIT(1:NVAL), insert value such that SPLIT */
/* remains sorted by decreasing value. */
/* The sorting is done by straightforward insertion; therefore the use */
/* of this routine should be avoided for large XX (XX < 20). */
/* Parameter adjustments */
--a;
--w;
--lenh;
--lenl;
--ip;
 
/* Function Body */
*nval = 0;
i__1 = *wlen;
for (k = 1; k <= i__1; ++k) {
j = w[k];
i__2 = ip[j] + lenh[j] - 1;
for (ii = ip[j] + lenl[j]; ii <= i__2; ++ii) {
ha = a[ii];
if (*nval == 0) {
split[0] = ha;
*nval = 1;
} else {
/* Check presence of HA in SPLIT */
for (s = *nval; s >= 1; --s) {
if (split[s - 1] == ha) {
goto L15;
}
if (split[s - 1] > ha) {
pos = s + 1;
goto L21;
}
/* L20: */
}
pos = 1;
/* The insertion */
L21:
i__3 = pos;
for (s = *nval; s >= i__3; --s) {
split[s] = split[s - 1];
/* L22: */
}
split[pos - 1] = ha;
++(*nval);
}
/* Exit loop if XX values are found */
if (*nval == 10) {
goto L11;
}
L15:
;
}
/* L10: */
}
/* Determine VAL */
L11:
if (*nval > 0) {
*val = split[(*nval + 1) / 2 - 1];
}
return 0;
} /* mc64qd_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64ud_(int_t *id, int_t *mod, int_t *n, int_t *
irn, int_t *lirn, int_t *ip, int_t *lenc, int_t *fc, int_t *
iperm, int_t *num, int_t *numx, int_t *pr, int_t *arp,
int_t *cv, int_t *out)
{
/* System generated locals */
int_t i__1, i__2, i__3, i__4;
 
/* Local variables */
int_t i__, j, k, j1, ii, kk, id0, id1, in1, in2, nfc, num0, num1, num2,
jord, last;
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* PR(J) is the previous column to J in the depth first search. */
/* Array PR is used as workspace in the sorting algorithm. */
/* Elements (I,IPERM(I)) I=1,..,N are entries at the end of the */
/* algorithm unless N assignments have not been made in which case */
/* N-NUM pairs (I,IPERM(I)) will not be entries in the matrix. */
/* CV(I) is the most recent loop number (ID+JORD) at which row I */
/* was visited. */
/* ARP(J) is the number of entries in column J which have been scanned */
/* when looking for a cheap assignment. */
/* OUT(J) is one less than the number of entries in column J which have */
/* not been scanned during one pass through the main loop. */
/* NUMX is maximum possible size of matching. */
/* Parameter adjustments */
--out;
--cv;
--arp;
--pr;
--iperm;
--fc;
--lenc;
--ip;
--irn;
 
/* Function Body */
if (*id == 1) {
/* The first call to MC64U/UD. */
/* Initialize CV and ARP; parameters MOD, NUMX are not accessed */
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
cv[i__] = 0;
arp[i__] = 0;
/* L5: */
}
num1 = *n;
num2 = *n;
} else {
/* Not the first call to MC64U/UD. */
/* Re-initialize ARP if entries were deleted since last call to MC64U/UD */
if (*mod == 1) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
arp[i__] = 0;
/* L8: */
}
}
num1 = *numx;
num2 = *n - *numx;
}
num0 = *num;
/* NUM0 is size of input matching */
/* NUM1 is maximum possible size of matching */
/* NUM2 is maximum allowed number of unassigned rows/columns */
/* NUM is size of current matching */
/* Quick return if possible */
/* IF (NUM.EQ.N) GO TO 199 */
/* NFC is number of rows/columns that could not be assigned */
nfc = 0;
/* Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD, */
/* so 1st call uses 1..N, 2nd call uses N+1..2N, etc */
id0 = (*id - 1) * *n;
/* Main loop. Each pass round this loop either results in a new */
/* assignment or gives a column with no assignment */
i__1 = *n;
for (jord = num0 + 1; jord <= i__1; ++jord) {
/* Each pass uses unique number ID1 */
id1 = id0 + jord;
/* J is unmatched column */
j = fc[jord - num0];
pr[j] = -1;
i__2 = jord;
for (k = 1; k <= i__2; ++k) {
/* Look for a cheap assignment */
if (arp[j] >= lenc[j]) {
goto L30;
}
in1 = ip[j] + arp[j];
in2 = ip[j] + lenc[j] - 1;
i__3 = in2;
for (ii = in1; ii <= i__3; ++ii) {
i__ = irn[ii];
if (iperm[i__] == 0) {
goto L80;
}
/* L20: */
}
/* No cheap assignment in row */
arp[j] = lenc[j];
/* Begin looking for assignment chain starting with row J */
L30:
out[j] = lenc[j] - 1;
/* Inner loop. Extends chain by one or backtracks */
i__3 = jord;
for (kk = 1; kk <= i__3; ++kk) {
in1 = out[j];
if (in1 < 0) {
goto L50;
}
in2 = ip[j] + lenc[j] - 1;
in1 = in2 - in1;
/* Forward scan */
i__4 = in2;
for (ii = in1; ii <= i__4; ++ii) {
i__ = irn[ii];
if (cv[i__] == id1) {
goto L40;
}
/* Column J has not yet been accessed during this pass */
j1 = j;
j = iperm[i__];
cv[i__] = id1;
pr[j] = j1;
out[j1] = in2 - ii - 1;
goto L70;
L40:
;
}
/* Backtracking step. */
L50:
j1 = pr[j];
if (j1 == -1) {
/* No augmenting path exists for column J. */
++nfc;
fc[nfc] = j;
if (nfc > num2) {
/* A matching of maximum size NUM1 is not possible */
last = jord;
goto L101;
}
goto L100;
}
j = j1;
/* L60: */
}
/* End of dummy loop; this point is never reached */
L70:
;
}
/* End of dummy loop; this point is never reached */
/* New assignment is made. */
L80:
iperm[i__] = j;
arp[j] = ii - ip[j] + 1;
++(*num);
i__2 = jord;
for (k = 1; k <= i__2; ++k) {
j = pr[j];
if (j == -1) {
goto L95;
}
ii = ip[j] + lenc[j] - out[j] - 2;
i__ = irn[ii];
iperm[i__] = j;
/* L90: */
}
/* End of dummy loop; this point is never reached */
L95:
if (*num == num1) {
/* A matching of maximum size NUM1 is found */
last = jord;
goto L101;
}
 
L100:
;
}
/* All unassigned columns have been considered */
last = *n;
/* Now, a transversal is computed or is not possible. */
/* Complete FC before returning. */
L101:
i__1 = *n;
for (jord = last + 1; jord <= i__1; ++jord) {
++nfc;
fc[nfc] = fc[jord - num0];
/* L110: */
}
/* 199 RETURN */
return 0;
} /* mc64ud_ */
 
/* ********************************************************************** */
/* Subroutine */ int_t mc64wd_(int_t *n, int_t *ne, int_t *ip, int_t *
irn, double *a, int_t *iperm, int_t *num, int_t *jperm,
int_t *out, int_t *pr, int_t *q, int_t *l, double *u,
double *d__)
{
/* System generated locals */
int_t i__1, i__2, i__3;
 
/* Local variables */
int_t i__, j, k, i0, k0, k1, k2, q0;
double di;
int_t ii, jj, kk;
double vj;
int_t up;
double dq0;
int_t kk1, kk2;
double csp;
int_t isp, jsp, low;
double dmin__, dnew;
int_t jord, qlen, jdum;
double rinf;
extern /* Subroutine */ int_t mc64dd_(int_t *, int_t *, int_t *,
double *, int_t *, int_t *), mc64ed_(int_t *, int_t *,
int_t *, double *, int_t *, int_t *), mc64fd_(int_t *
, int_t *, int_t *, int_t *, double *, int_t *,
int_t *);
 
 
/* *** Copyright (c) 1999 Council for the Central Laboratory of the */
/* Research Councils *** */
/* *** Although every effort has been made to ensure robustness and *** */
/* *** reliability of the subroutines in this MC64 suite, we *** */
/* *** disclaim any liability arising through the use or misuse of *** */
/* *** any of the subroutines. *** */
/* *** Any problems? Contact ... */
/* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */
 
/* N, NE, IP, IRN are described in MC64A/AD. */
/* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */
/* A(K), K=1..NE, must be set to the value of the entry that */
/* corresponds to IRN(K). It is not altered. */
/* All values A(K) must be non-negative. */
/* IPERM is an INT_T array of length N. On exit, it contains the */
/* weighted matching: IPERM(I) = 0 or row I is matched to column */
/* IPERM(I). */
/* NUM is an INT_T variable. On exit, it contains the cardinality of */
/* the matching stored in IPERM. */
/* IW is an INT_T work array of length 5N. */
/* DW is a REAL (DOUBLE PRECISION in the D-version) array of length 2N. */
/* On exit, U = D(1:N) contains the dual row variable and */
/* V = D(N+1:2N) contains the dual column variable. If the matrix */
/* is structurally nonsingular (NUM = N), the following holds: */
/* U(I)+V(J) <= A(I,J) if IPERM(I) |= J */
/* U(I)+V(J) = A(I,J) if IPERM(I) = J */
/* U(I) = 0 if IPERM(I) = 0 */
/* V(J) = 0 if there is no I for which IPERM(I) = J */
/* Local variables */
/* Local parameters */
/* External subroutines and/or functions */
/* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD */
/* DOUBLE PRECISION FD05AD */
/* Set RINF to largest positive real number */
/* XSL RINF = FD05AD(5) */
/* Parameter adjustments */
--d__;
--u;
--l;
--q;
--pr;
--out;
--jperm;
--iperm;
--ip;
--a;
--irn;
 
/* Function Body */
rinf = dlamch_("Overflow");
/* Initialization */
*num = 0;
i__1 = *n;
for (k = 1; k <= i__1; ++k) {
u[k] = rinf;
d__[k] = 0.;
iperm[k] = 0;
jperm[k] = 0;
pr[k] = ip[k];
l[k] = 0;
/* L10: */
}
/* Initialize U(I) */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
i__ = irn[k];
if (a[k] > u[i__]) {
goto L20;
}
u[i__] = a[k];
iperm[i__] = j;
l[i__] = k;
L20:
;
}
/* L30: */
}
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
j = iperm[i__];
if (j == 0) {
goto L40;
}
/* Row I is not empty */
iperm[i__] = 0;
if (jperm[j] != 0) {
goto L40;
}
/* Assignment of column J to row I */
++(*num);
iperm[i__] = j;
jperm[j] = l[i__];
L40:
;
}
if (*num == *n) {
goto L1000;
}
/* Scan unassigned columns; improve assignment */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
/* JPERM(J) ne 0 iff column J is already assigned */
if (jperm[j] != 0) {
goto L95;
}
k1 = ip[j];
k2 = ip[j + 1] - 1;
/* Continue only if column J is not empty */
if (k1 > k2) {
goto L95;
}
vj = rinf;
i__2 = k2;
for (k = k1; k <= i__2; ++k) {
i__ = irn[k];
di = a[k] - u[i__];
if (di > vj) {
goto L50;
}
if (di < vj || di == rinf) {
goto L55;
}
if (iperm[i__] != 0 || iperm[i0] == 0) {
goto L50;
}
L55:
vj = di;
i0 = i__;
k0 = k;
L50:
;
}
d__[j] = vj;
k = k0;
i__ = i0;
if (iperm[i__] == 0) {
goto L90;
}
i__2 = k2;
for (k = k0; k <= i__2; ++k) {
i__ = irn[k];
if (a[k] - u[i__] > vj) {
goto L60;
}
jj = iperm[i__];
/* Scan remaining part of assigned column JJ */
kk1 = pr[jj];
kk2 = ip[jj + 1] - 1;
if (kk1 > kk2) {
goto L60;
}
i__3 = kk2;
for (kk = kk1; kk <= i__3; ++kk) {
ii = irn[kk];
if (iperm[ii] > 0) {
goto L70;
}
if (a[kk] - u[ii] <= d__[jj]) {
goto L80;
}
L70:
;
}
pr[jj] = kk2 + 1;
L60:
;
}
goto L95;
L80:
jperm[jj] = kk;
iperm[ii] = jj;
pr[jj] = kk + 1;
L90:
++(*num);
jperm[j] = k;
iperm[i__] = j;
pr[j] = k + 1;
L95:
;
}
if (*num == *n) {
goto L1000;
}
/* Prepare for main loop */
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
d__[i__] = rinf;
l[i__] = 0;
/* L99: */
}
/* Main loop ... each pass round this loop is similar to Dijkstra's */
/* algorithm for solving the single source shortest path problem */
i__1 = *n;
for (jord = 1; jord <= i__1; ++jord) {
if (jperm[jord] != 0) {
goto L100;
}
/* JORD is next unmatched column */
/* DMIN is the length of shortest path in the tree */
dmin__ = rinf;
qlen = 0;
low = *n + 1;
up = *n + 1;
/* CSP is the cost of the shortest augmenting path to unassigned row */
/* IRN(ISP). The corresponding column index is JSP. */
csp = rinf;
/* Build shortest path tree starting from unassigned column (root) JORD */
j = jord;
pr[j] = -1;
/* Scan column J */
i__2 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__2; ++k) {
i__ = irn[k];
dnew = a[k] - u[i__];
if (dnew >= csp) {
goto L115;
}
if (iperm[i__] == 0) {
csp = dnew;
isp = k;
jsp = j;
} else {
if (dnew < dmin__) {
dmin__ = dnew;
}
d__[i__] = dnew;
++qlen;
q[qlen] = k;
}
L115:
;
}
/* Initialize heap Q and Q2 with rows held in Q(1:QLEN) */
q0 = qlen;
qlen = 0;
i__2 = q0;
for (kk = 1; kk <= i__2; ++kk) {
k = q[kk];
i__ = irn[k];
if (csp <= d__[i__]) {
d__[i__] = rinf;
goto L120;
}
if (d__[i__] <= dmin__) {
--low;
q[low] = i__;
l[i__] = low;
} else {
++qlen;
l[i__] = qlen;
mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__2);
}
/* Update tree */
jj = iperm[i__];
out[jj] = k;
pr[jj] = j;
L120:
;
}
i__2 = *num;
for (jdum = 1; jdum <= i__2; ++jdum) {
/* If Q2 is empty, extract rows from Q */
if (low == up) {
if (qlen == 0) {
goto L160;
}
i__ = q[1];
if (d__[i__] >= csp) {
goto L160;
}
dmin__ = d__[i__];
L152:
mc64ed_(&qlen, n, &q[1], &d__[1], &l[1], &c__2);
--low;
q[low] = i__;
l[i__] = low;
if (qlen == 0) {
goto L153;
}
i__ = q[1];
if (d__[i__] > dmin__) {
goto L153;
}
goto L152;
}
/* Q0 is row whose distance D(Q0) to the root is smallest */
L153:
q0 = q[up - 1];
dq0 = d__[q0];
/* Exit loop if path to Q0 is longer than the shortest augmenting path */
if (dq0 >= csp) {
goto L160;
}
--up;
/* Scan column that matches with row Q0 */
j = iperm[q0];
vj = dq0 - a[jperm[j]] + u[q0];
i__3 = ip[j + 1] - 1;
for (k = ip[j]; k <= i__3; ++k) {
i__ = irn[k];
if (l[i__] >= up) {
goto L155;
}
/* DNEW is new cost */
dnew = vj + a[k] - u[i__];
/* Do not update D(I) if DNEW ge cost of shortest path */
if (dnew >= csp) {
goto L155;
}
if (iperm[i__] == 0) {
/* Row I is unmatched; update shortest path info */
csp = dnew;
isp = k;
jsp = j;
} else {
/* Row I is matched; do not update D(I) if DNEW is larger */
di = d__[i__];
if (di <= dnew) {
goto L155;
}
if (l[i__] >= low) {
goto L155;
}
d__[i__] = dnew;
if (dnew <= dmin__) {
if (l[i__] != 0) {
mc64fd_(&l[i__], &qlen, n, &q[1], &d__[1], &l[1],
&c__2);
}
--low;
q[low] = i__;
l[i__] = low;
} else {
if (l[i__] == 0) {
++qlen;
l[i__] = qlen;
}
mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__2);
}
/* Update tree */
jj = iperm[i__];
out[jj] = k;
pr[jj] = j;
}
L155:
;
}
/* L150: */
}
/* If CSP = RINF, no augmenting path is found */
L160:
if (csp == rinf) {
goto L190;
}
/* Find augmenting path by tracing backward in PR; update IPERM,JPERM */
++(*num);
i__ = irn[isp];
iperm[i__] = jsp;
jperm[jsp] = isp;
j = jsp;
i__2 = *num;
for (jdum = 1; jdum <= i__2; ++jdum) {
jj = pr[j];
if (jj == -1) {
goto L180;
}
k = out[j];
i__ = irn[k];
iperm[i__] = jj;
jperm[jj] = k;
j = jj;
/* L170: */
}
/* End of dummy loop; this point is never reached */
/* Update U for rows in Q(UP:N) */
L180:
i__2 = *n;
for (kk = up; kk <= i__2; ++kk) {
i__ = q[kk];
u[i__] = u[i__] + d__[i__] - csp;
/* L185: */
}
L190:
i__2 = *n;
for (kk = low; kk <= i__2; ++kk) {
i__ = q[kk];
d__[i__] = rinf;
l[i__] = 0;
/* L191: */
}
i__2 = qlen;
for (kk = 1; kk <= i__2; ++kk) {
i__ = q[kk];
d__[i__] = rinf;
l[i__] = 0;
/* L193: */
}
L100:
;
}
/* End of main loop */
/* Set dual column variable in D(1:N) */
L1000:
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
k = jperm[j];
if (k != 0) {
d__[j] = a[k] - u[irn[k]];
} else {
d__[j] = 0.;
}
if (iperm[j] == 0) {
u[j] = 0.;
}
/* L200: */
}
if (*num == *n) {
goto L1100;
}
/* The matrix is structurally singular, complete IPERM. */
/* JPERM, OUT are work arrays */
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
jperm[j] = 0;
/* L300: */
}
k = 0;
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
if (iperm[i__] == 0) {
++k;
out[k] = i__;
} else {
j = iperm[i__];
jperm[j] = i__;
}
/* L310: */
}
k = 0;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (jperm[j] != 0) {
goto L320;
}
++k;
jdum = out[k];
iperm[jdum] = j;
L320:
;
}
L1100:
return 0;
} /* mc64wd_ */
 
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/superlu_enum_consts.h
New file
0,0 → 1,72
/** @file superlu_enum_consts.h
* \brief enum constants header file
*
* -- SuperLU routine (version 4.1) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley,
* October 1, 2010
*
*/
 
#ifndef __SUPERLU_ENUM_CONSTS /* allow multiple inclusions */
#define __SUPERLU_ENUM_CONSTS
 
/***********************************************************************
* Enumerate types
***********************************************************************/
typedef enum {NO, YES} yes_no_t;
typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t;
typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t;
typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD,
METIS_AT_PLUS_A, PARMETIS, ZOLTAN, MY_PERMC} colperm_t;
typedef enum {NOTRANS, TRANS, CONJ} trans_t;
typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t;
typedef enum {NOREFINE, SINGLE=1, DOUBLE, EXTRA} IterRefine_t;
typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType;
typedef enum {HEAD, TAIL} stack_end_t;
typedef enum {SYSTEM, USER} LU_space_t;
typedef enum {ONE_NORM, TWO_NORM, INF_NORM} norm_t;
typedef enum {SILU, SMILU_1, SMILU_2, SMILU_3} milu_t;
#if 0
typedef enum {NODROP = 0x0000,
DROP_BASIC = 0x0001, /* ILU(tau) */
DROP_PROWS = 0x0002, /* ILUTP: keep p maximum rows */
DROP_COLUMN = 0x0004, /* ILUTP: for j-th column,
p = gamma * nnz(A(:,j)) */
DROP_AREA = 0x0008, /* ILUTP: for j-th column, use
nnz(F(:,1:j)) / nnz(A(:,1:j))
to limit memory growth */
DROP_SECONDARY = 0x000E, /* PROWS | COLUMN | AREA */
DROP_DYNAMIC = 0x0010,
DROP_INTERP = 0x0100} rule_t;
#endif
 
 
/*
* The following enumerate type is used by the statistics variable
* to keep track of flop count and time spent at various stages.
*
* Note that not all of the fields are disjoint.
*/
typedef enum {
COLPERM, /* find a column ordering that minimizes fills */
ROWPERM, /* find a row ordering maximizes diagonal. */
RELAX, /* find artificial supernodes */
ETREE, /* compute column etree */
EQUIL, /* equilibrate the original matrix */
SYMBFAC, /* symbolic factorization. */
DIST, /* distribute matrix. */
FACT, /* perform LU factorization */
COMM, /* communication for factorization */
SOL_COMM,/* communication for solve */
RCOND, /* estimate reciprocal condition number */
SOLVE, /* forward and back solves */
REFINE, /* perform iterative refinement */
FLOAT, /* time spent in floating-point operations */
TRSV, /* fraction of FACT spent in xTRSV */
GEMV, /* fraction of FACT spent in xGEMV */
FERR, /* estimate error bounds after iterative refinement */
NPHASES /* total number of phases */
} PhaseType;
 
 
#endif /* __SUPERLU_ENUM_CONSTS */
/trunk/OTHER/SuperLU_DIST_2.5/SRC/pdgstrf_X1.c
New file
0,0 → 1,1337
/*! @file
* \brief Performs the LU factorization in parallel
*
* <pre>
* -- Distributed SuperLU routine (version 1.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* September 1, 1999
*
* Modified:
* Feburary 7, 2001 use MPI_Isend/MPI_Irecv
*
* Sketch of the algorithm
* =======================
*
* The following relations hold:
* * A_kk = L_kk * U_kk
* * L_ik = Aik * U_kk^(-1)
* * U_kj = L_kk^(-1) * A_kj
*
* ----------------------------------
* | | |
* ----|-----------------------------
* | | \ U_kk| |
* | | \ | U_kj |
* | |L_kk \ | || |
* ----|-------|---------||----------
* | | | \/ |
* | | | |
* | | | |
* | | | |
* | | L_ik ==> A_ij |
* | | | |
* | | | |
* | | | |
* ----------------------------------
*
* Handle the first block of columns separately.
* * Factor diagonal and subdiagonal blocks and test for exact
* singularity. ( pdgstrf2(0), one column at a time )
* * Compute block row of U
* * Update trailing matrix
*
* Loop over the remaining blocks of columns.
* mycol = MYCOL( iam, grid );
* myrow = MYROW( iam, grid );
* N = nsupers;
* For (k = 1; k < N; ++k) {
* krow = PROW( k, grid );
* kcol = PCOL( k, grid );
* Pkk = PNUM( krow, kcol, grid );
*
* * Factor diagonal and subdiagonal blocks and test for exact
* singularity.
* if ( mycol == kcol ) {
* pdgstrf2(k), one column at a time
* }
*
* * Parallel triangular solve
* if ( iam == Pkk ) multicast L_k,k to this process row;
* if ( myrow == krow && mycol != kcol ) {
* Recv L_k,k from process Pkk;
* for (j = k+1; j < N; ++j)
* if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
* U_k,j = L_k,k \ A_k,j;
* }
*
* * Parallel rank-k update
* if ( myrow == krow ) multicast U_k,k+1:N to this process column;
* if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
* if ( myrow != krow ) {
* Pkj = PNUM( krow, mycol, grid );
* Recv U_k,k+1:N from process Pkj;
* }
* if ( mycol != kcol ) {
* Pik = PNUM( myrow, kcol, grid );
* Recv L_k+1:N,k from process Pik;
* }
* for (j = k+1; k < N; ++k) {
* for (i = k+1; i < N; ++i)
* if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
* && L_i,k != 0 && U_k,j != 0 )
* A_i,j = A_i,j - L_i,k * U_k,j;
* }
* }
*
*
* Remaining issues
* (1) Use local indices for L subscripts and SPA. [DONE]
* </pre>
*/
 
#include <math.h>
#include "superlu_ddefs.h"
#define CRAY_X1
#if ( VAMPIR>=1 )
#include <VT.h>
#endif
 
/*
* Internal prototypes
*/
static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *,
gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *);
#ifdef _CRAY
static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd);
#else
static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *,
LocalLU_t *, SuperLUStat_t *);
#endif
 
/*
*
*/
/************************************************************************/
/*! \brief
*
* <pre>
* Purpose
* =======
*
* PDGSTRF performs the LU factorization in parallel.
*
* Arguments
* =========
*
* options (input) superlu_options_t*
* The structure defines the input parameters to control
* how the LU decomposition will be performed.
* The following field should be defined:
* o ReplaceTinyPivot (yes_no_t)
* Specifies whether to replace the tiny diagonals by
* sqrt(epsilon)*norm(A) during LU factorization.
*
* m (input) int
* Number of rows in the matrix.
*
* n (input) int
* Number of columns in the matrix.
*
* anorm (input) double
* The norm of the original matrix A, or the scaled A if
* equilibration was done.
*
* LUstruct (input/output) LUstruct_t*
* The data structures to store the distributed L and U factors.
* The following fields should be defined:
*
* o Glu_persist (input) Glu_persist_t*
* Global data structure (xsup, supno) replicated on all processes,
* describing the supernode partition in the factored matrices
* L and U:
* xsup[s] is the leading column of the s-th supernode,
* supno[i] is the supernode number to which column i belongs.
*
* o Llu (input/output) LocalLU_t*
* The distributed data structures to store L and U factors.
* See superlu_ddefs.h for the definition of 'LocalLU_t'.
*
* grid (input) gridinfo_t*
* The 2D process mesh. It contains the MPI communicator, the number
* of process rows (NPROW), the number of process columns (NPCOL),
* and my process rank. It is an input argument to all the
* parallel routines.
* Grid can be initialized by subroutine SUPERLU_GRIDINIT.
* See superlu_ddefs.h for the definition of 'gridinfo_t'.
*
* stat (output) SuperLUStat_t*
* Record the statistics on runtime and floating-point operation count.
* See util.h for the definition of 'SuperLUStat_t'.
*
* info (output) int*
* = 0: successful exit
* < 0: if info = -i, the i-th argument had an illegal value
* > 0: if info = i, U(i,i) is exactly zero. The factorization has
* been completed, but the factor U is exactly singular,
* and division by zero will occur if it is used to solve a
* system of equations.
* </pre>
*/
void pdgstrf
/************************************************************************/
(
superlu_options_t *options, int m, int n, double anorm,
LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info
)
 
{
#ifdef _CRAY
_fcd ftcs = _cptofcd("N", strlen("N"));
_fcd ftcs1 = _cptofcd("L", strlen("L"));
_fcd ftcs2 = _cptofcd("N", strlen("N"));
_fcd ftcs3 = _cptofcd("U", strlen("U"));
#endif
double alpha = 1.0, beta = 0.0;
int_t *xsup;
int_t *lsub, *lsub1, *usub, *Usub_buf,
*Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */
double *lusup, *lusup1, *uval, *Uval_buf,
*Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */
int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc,
lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj,
nlb, nub, nsupc, rel, rukp;
int_t Pc, Pr;
int iam, kcol, krow, mycol, myrow, pi, pj;
int j, k, lk, nsupers;
int nsupr, nbrow, segsize;
int msgcnt[4]; /* Count the size of the message xfer'd in each buffer:
* 0 : transferred in Lsub_buf[]
* 1 : transferred in Lval_buf[]
* 2 : transferred in Usub_buf[]
* 3 : transferred in Uval_buf[]
*/
int_t msg0, msg2;
int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr;
double **Unzval_br_ptr, **Lnzval_bc_ptr;
int_t *index;
double *nzval;
int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */
double *ucol;
int_t *indirect;
double *tempv, *tempv2d;
int_t iinfo;
int_t *ToRecv, *ToSendD, **ToSendR;
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
LocalLU_t *Llu = LUstruct->Llu;
superlu_scope_t *scp;
double s_eps, thresh;
double *tempU2d, *tempu;
int full, ldt, ldu, lead_zero, ncols;
MPI_Request recv_req[4], *send_req;
MPI_Status status;
#ifdef CRAY_X1
int nonzero_segs;
#endif
#if ( DEBUGlevel>=2 )
int_t num_copy=0, num_update=0;
#endif
#if ( PRNTlevel==3 )
int_t zero_msg = 0, total_msg = 0;
#endif
#if ( PROFlevel>=1 )
double t1, t2;
float msg_vol = 0, msg_cnt = 0;
int_t iword = sizeof(int_t), dword = sizeof(double);
#endif
 
/* Test the input parameters. */
*info = 0;
if ( m < 0 ) *info = -2;
else if ( n < 0 ) *info = -3;
if ( *info ) {
pxerbla("pdgstrf", grid, -*info);
return;
}
 
/* Quick return if possible. */
if ( m == 0 || n == 0 ) return;
 
/*
* Initialization.
*/
iam = grid->iam;
Pc = grid->npcol;
Pr = grid->nprow;
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
nsupers = Glu_persist->supno[n-1] + 1;
xsup = Glu_persist->xsup;
s_eps = slamch_("Epsilon");
thresh = s_eps * anorm;
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter pdgstrf()");
#endif
 
stat->ops[FACT] = 0.0;
 
if ( Pr*Pc > 1 ) {
i = Llu->bufmax[0];
if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) )
ABORT("Malloc fails for Lsub_buf.");
Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i;
i = Llu->bufmax[1];
if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) )
ABORT("Malloc fails for Lval_buf[].");
Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i;
if ( Llu->bufmax[2] != 0 )
if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) )
ABORT("Malloc fails for Usub_buf[].");
if ( Llu->bufmax[3] != 0 )
if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) )
ABORT("Malloc fails for Uval_buf[].");
if ( !(send_req =
(MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request))))
ABORT("Malloc fails for send_req[].");
}
if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) )
ABORT("Malloc fails for ujrow[].");
 
#if ( PRNTlevel>=1 )
if ( !iam ) {
printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh);
printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n",
Llu->bufmax[0], Llu->bufmax[1],
Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]);
}
#endif
 
Lsub_buf_2[0] = Llu->Lsub_buf_2[0];
Lsub_buf_2[1] = Llu->Lsub_buf_2[1];
Lval_buf_2[0] = Llu->Lval_buf_2[0];
Lval_buf_2[1] = Llu->Lval_buf_2[1];
Usub_buf = Llu->Usub_buf;
Uval_buf = Llu->Uval_buf;
Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
Unzval_br_ptr = Llu->Unzval_br_ptr;
ToRecv = Llu->ToRecv;
ToSendD = Llu->ToSendD;
ToSendR = Llu->ToSendR;
 
ldt = sp_ienv_dist(3); /* Size of maximum supernode */
if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) )
ABORT("Calloc fails for tempv2d[].");
tempU2d = tempv2d + ldt*ldt;
#ifdef CRAY_X1
if ( !(indirect = intMalloc_dist(2*ldt)) )
ABORT("Malloc fails for indirect[].");
#else
if ( !(indirect = intMalloc_dist(ldt)) )
ABORT("Malloc fails for indirect[].");
#endif
k = CEILING( nsupers, Pr ); /* Number of local block rows */
if ( !(iuip = intMalloc_dist(k)) )
ABORT("Malloc fails for iuip[].");
if ( !(ruip = intMalloc_dist(k)) )
ABORT("Malloc fails for ruip[].");
 
#if ( VAMPIR>=1 )
VT_symdef(1, "Send-L", "Comm");
VT_symdef(2, "Recv-L", "Comm");
VT_symdef(3, "Send-U", "Comm");
VT_symdef(4, "Recv-U", "Comm");
VT_symdef(5, "TRF2", "Factor");
VT_symdef(100, "Factor", "Factor");
VT_begin(100);
VT_traceon();
#endif
 
/* ---------------------------------------------------------------
Handle the first block column separately to start the pipeline.
--------------------------------------------------------------- */
if ( mycol == 0 ) {
#if ( VAMPIR>=1 )
VT_begin(5);
#endif
pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info);
#if ( VAMPIR>=1 )
VT_end(5);
#endif
 
scp = &grid->rscp; /* The scope of process row. */
 
/* Process column *kcol* multicasts numeric values of L(:,k)
to process rows. */
lsub = Lrowind_bc_ptr[0];
lusup = Lnzval_bc_ptr[0];
if ( lsub ) {
msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR;
msgcnt[1] = lsub[1] * SuperSize( 0 );
} else {
msgcnt[0] = msgcnt[1] = 0;
}
for (pj = 0; pj < Pc; ++pj) {
if ( ToSendR[0][pj] != EMPTY ) {
#if ( PROFlevel>=1 )
TIC(t1);
#endif
#if ( VAMPIR>=1 )
VT_begin(1);
#endif
MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm,
&send_req[pj] );
MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm,
&send_req[pj+Pc] );
#if ( DEBUGlevel>=2 )
printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
iam, 0, msgcnt[0], msgcnt[1], pj);
#endif
#if ( VAMPIR>=1 )
VT_end(1);
#endif
#if ( PROFlevel>=1 )
TOC(t2, t1);
stat->utime[COMM] += t2;
msg_cnt += 2;
msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
#endif
}
} /* for pj ... */
} else { /* Post immediate receives. */
if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */
scp = &grid->rscp; /* The scope of process row. */
MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0,
0, scp->comm, &recv_req[0] );
MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0,
1, scp->comm, &recv_req[1] );
#if ( DEBUGlevel>=2 )
printf("(%d) Post Irecv L(:,%4d)\n", iam, 0);
#endif
}
} /* if mycol == 0 */
 
/* ------------------------------------------
MAIN LOOP: Loop through all block columns.
------------------------------------------ */
for (k = 0; k < nsupers; ++k) {
 
knsupc = SuperSize( k );
krow = PROW( k, grid );
kcol = PCOL( k, grid );
 
if ( mycol == kcol ) {
lk = LBj( k, grid ); /* Local block number. */
 
for (pj = 0; pj < Pc; ++pj) {
/* Wait for Isend to complete before using lsub/lusup. */
if ( ToSendR[lk][pj] != EMPTY ) {
MPI_Wait( &send_req[pj], &status );
MPI_Wait( &send_req[pj+Pc], &status );
}
}
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
} else {
if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */
scp = &grid->rscp; /* The scope of process row. */
#if ( PROFlevel>=1 )
TIC(t1);
#endif
#if ( VAMPIR>=1 )
VT_begin(2);
#endif
/*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm,
Llu->bufmax[0]);*/
/*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
(4*k)%NTAGS, scp->comm, &status );*/
MPI_Wait( &recv_req[0], &status );
MPI_Get_count( &status, mpi_int_t, &msgcnt[0] );
/*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm,
Llu->bufmax[1]);*/
/*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol,
(4*k+1)%NTAGS, scp->comm, &status );*/
MPI_Wait( &recv_req[1], &status );
MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] );
#if ( VAMPIR>=1 )
VT_end(2);
#endif
#if ( PROFlevel>=1 )
TOC(t2, t1);
stat->utime[COMM] += t2;
#endif
#if ( DEBUGlevel>=2 )
printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n",
iam, k, msgcnt[0], msgcnt[1], kcol);
fflush(stdout);
#endif
lsub = Lsub_buf_2[k%2];
lusup = Lval_buf_2[k%2];
#if ( PRNTlevel==3 )
++total_msg;
if ( !msgcnt[0] ) ++zero_msg;
#endif
} else msgcnt[0] = 0;
} /* if mycol = Pc(k) */
 
scp = &grid->cscp; /* The scope of process column. */
 
if ( myrow == krow ) {
/* Parallel triangular solve across process row *krow* --
U(k,j) = L(k,k) \ A(k,j). */
#ifdef _CRAY
pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3);
#else
pdgstrs2(n, k, Glu_persist, grid, Llu, stat);
#endif
 
/* Multicasts U(k,:) to process columns. */
lk = LBi( k, grid );
usub = Ufstnz_br_ptr[lk];
uval = Unzval_br_ptr[lk];
if ( usub ) {
msgcnt[2] = usub[2];
msgcnt[3] = usub[1];
} else {
msgcnt[2] = msgcnt[3] = 0;
}
 
if ( ToSendD[lk] == YES ) {
for (pi = 0; pi < Pr; ++pi) {
if ( pi != myrow ) {
#if ( PROFlevel>=1 )
TIC(t1);
#endif
#if ( VAMPIR>=1 )
VT_begin(3);
#endif
MPI_Send( usub, msgcnt[2], mpi_int_t, pi,
(4*k+2)%NTAGS, scp->comm);
MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi,
(4*k+3)%NTAGS, scp->comm);
#if ( VAMPIR>=1 )
VT_end(3);
#endif
#if ( PROFlevel>=1 )
TOC(t2, t1);
stat->utime[COMM] += t2;
msg_cnt += 2;
msg_vol += msgcnt[2]*iword + msgcnt[3]*dword;
#endif
#if ( DEBUGlevel>=2 )
printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi);
#endif
} /* if pi ... */
} /* for pi ... */
} /* if ToSendD ... */
} else { /* myrow != krow */
if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */
#if ( PROFlevel>=1 )
TIC(t1);
#endif
#if ( VAMPIR>=1 )
VT_begin(4);
#endif
/*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm,
Llu->bufmax[2]);*/
MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
(4*k+2)%NTAGS, scp->comm, &status );
MPI_Get_count( &status, mpi_int_t, &msgcnt[2] );
/*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm,
Llu->bufmax[3]);*/
MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow,
(4*k+3)%NTAGS, scp->comm, &status );
MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] );
#if ( VAMPIR>=1 )
VT_end(4);
#endif
#if ( PROFlevel>=1 )
TOC(t2, t1);
stat->utime[COMM] += t2;
#endif
usub = Usub_buf;
uval = Uval_buf;
#if ( DEBUGlevel>=2 )
printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow);
#endif
#if ( PRNTlevel==3 )
++total_msg;
if ( !msgcnt[2] ) ++zero_msg;
#endif
} else msgcnt[2] = 0;
} /* if myrow == Pr(k) */
/*
* Parallel rank-k update; pair up blocks L(i,k) and U(k,j).
* for (j = k+1; k < N; ++k) {
* for (i = k+1; i < N; ++i)
* if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
* && L(i,k) != 0 && U(k,j) != 0 )
* A(i,j) = A(i,j) - L(i,k) * U(k,j);
*/
msg0 = msgcnt[0];
msg2 = msgcnt[2];
if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
nsupr = lsub[1]; /* LDA of lusup. */
if ( myrow == krow ) { /* Skip diagonal block L(k,k). */
lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1];
luptr0 = knsupc;
nlb = lsub[0] - 1;
} else {
lptr0 = BC_HEADER;
luptr0 = 0;
nlb = lsub[0];
}
lptr = lptr0;
for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */
ib = lsub[lptr];
lib = LBi( ib, grid );
iuip[lib] = BR_HEADER;
ruip[lib] = 0;
lptr += LB_DESCRIPTOR + lsub[lptr+1];
}
nub = usub[0]; /* Number of blocks in the block row U(k,:) */
iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */
rukp = 0; /* Pointer to nzval[] of U(k,:) */
klst = FstBlockC( k+1 );
/* ---------------------------------------------------
Update the first block column A(:,k+1).
--------------------------------------------------- */
jb = usub[iukp]; /* Global block number of block U(k,j). */
if ( jb == k+1 ) { /* First update (k+1)-th block. */
--nub;
lptr = lptr0;
luptr = luptr0;
ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
nsupc = SuperSize( jb );
iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
 
/* Prepare to call DGEMM. */
jj = iukp;
while ( usub[jj] == klst ) ++jj;
ldu = klst - usub[jj++];
ncols = 1;
full = 1;
for (; jj < iukp+nsupc; ++jj) {
segsize = klst - usub[jj];
if ( segsize ) {
++ncols;
if ( segsize != ldu ) full = 0;
if ( segsize > ldu ) ldu = segsize;
}
}
#if ( DEBUGlevel>=3 )
++num_update;
#endif
if ( full ) {
tempu = &uval[rukp];
} else { /* Copy block U(k,j) into tempU2d. */
#if ( DEBUGlevel>=3 )
printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
iam, full, k, jb, ldu, ncols, nsupc);
++num_copy;
#endif
tempu = tempU2d;
for (jj = iukp; jj < iukp+nsupc; ++jj) {
segsize = klst - usub[jj];
if ( segsize ) {
lead_zero = ldu - segsize;
for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
tempu += lead_zero;
for (i = 0; i < segsize; ++i)
tempu[i] = uval[rukp+i];
rukp += segsize;
tempu += segsize;
}
}
tempu = tempU2d;
rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
} /* if full ... */
 
for (lb = 0; lb < nlb; ++lb) {
ib = lsub[lptr]; /* Row block L(i,k). */
nbrow = lsub[lptr+1]; /* Number of full rows. */
lptr += LB_DESCRIPTOR; /* Skip descriptor. */
tempv = tempv2d;
#ifdef _CRAY
SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha,
&lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
tempu, &ldu, &beta, tempv, &ldt);
#else
dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
&lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
tempu, &ldu, &beta, tempv, &ldt);
#endif
stat->ops[FACT] += 2 * nbrow * ldu * ncols;
 
/* Now gather the result into the destination block. */
if ( ib < jb ) { /* A(i,j) is in U. */
ilst = FstBlockC( ib+1 );
lib = LBi( ib, grid );
index = Ufstnz_br_ptr[lib];
ijb = index[iuip[lib]];
while ( ijb < jb ) { /* Search for dest block. */
ruip[lib] += index[iuip[lib]+1];
iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
ijb = index[iuip[lib]];
}
iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */
 
tempv = tempv2d;
for (jj = 0; jj < nsupc; ++jj) {
segsize = klst - usub[iukp + jj];
fnz = index[iuip[lib]++];
if ( segsize ) { /* Nonzero segment in U(k.j). */
ucol = &Unzval_br_ptr[lib][ruip[lib]];
for (i = 0, it = 0; i < nbrow; ++i) {
rel = lsub[lptr + i] - fnz;
ucol[rel] -= tempv[it++];
}
tempv += ldt;
}
ruip[lib] += ilst - fnz;
}
} else { /* A(i,j) is in L. */
index = Lrowind_bc_ptr[ljb];
ldv = index[1]; /* LDA of the dest lusup. */
lptrj = BC_HEADER;
luptrj = 0;
ijb = index[lptrj];
while ( ijb != ib ) { /* Search for dest block --
blocks are not ordered! */
luptrj += index[lptrj+1];
lptrj += LB_DESCRIPTOR + index[lptrj+1];
ijb = index[lptrj];
}
/*
* Build indirect table. This is needed because the
* indices are not sorted.
*/
fnz = FstBlockC( ib );
lptrj += LB_DESCRIPTOR;
for (i = 0; i < index[lptrj-1]; ++i) {
rel = index[lptrj + i] - fnz;
indirect[rel] = i;
}
nzval = Lnzval_bc_ptr[ljb] + luptrj;
tempv = tempv2d;
for (jj = 0; jj < nsupc; ++jj) {
segsize = klst - usub[iukp + jj];
if ( segsize ) {
/*#pragma _CRI cache_bypass nzval,tempv*/
for (it = 0, i = 0; i < nbrow; ++i) {
rel = lsub[lptr + i] - fnz;
nzval[indirect[rel]] -= tempv[it++];
}
tempv += ldt;
}
nzval += ldv;
}
} /* if ib < jb ... */
lptr += nbrow;
luptr += nbrow;
} /* for lb ... */
rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
iukp += nsupc;
} /* if jb == k+1 */
} /* if L(:,k) and U(k,:) not empty */
 
 
if ( k+1 < nsupers ) {
kcol = PCOL( k+1, grid );
if ( mycol == kcol ) {
#if ( VAMPIR>=1 )
VT_begin(5);
#endif
/* Factor diagonal and subdiagonal blocks and test for exact
singularity. */
pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info);
#if ( VAMPIR>=1 )
VT_end(5);
#endif
 
/* Process column *kcol+1* multicasts numeric values of L(:,k+1)
to process rows. */
lk = LBj( k+1, grid ); /* Local block number. */
lsub1 = Lrowind_bc_ptr[lk];
if ( lsub1 ) {
msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR;
msgcnt[1] = lsub1[1] * SuperSize( k+1 );
} else {
msgcnt[0] = 0;
msgcnt[1] = 0;
}
scp = &grid->rscp; /* The scope of process row. */
for (pj = 0; pj < Pc; ++pj) {
if ( ToSendR[lk][pj] != EMPTY ) {
lusup1 = Lnzval_bc_ptr[lk];
#if ( PROFlevel>=1 )
TIC(t1);
#endif
#if ( VAMPIR>=1 )
VT_begin(1);
#endif
MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj,
(4*(k+1))%NTAGS, scp->comm, &send_req[pj] );
MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj,
(4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] );
#if ( VAMPIR>=1 )
VT_end(1);
#endif
#if ( PROFlevel>=1 )
TOC(t2, t1);
stat->utime[COMM] += t2;
msg_cnt += 2;
msg_vol += msgcnt[0]*iword + msgcnt[1]*dword;
#endif
#if ( DEBUGlevel>=2 )
printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n",
iam, k+1, msgcnt[0], msgcnt[1], pj);
#endif
}
} /* for pj ... */
} else { /* Post Recv of block column L(:,k+1). */
if ( ToRecv[k+1] >= 1 ) {
scp = &grid->rscp; /* The scope of process row. */
MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol,
(4*(k+1))%NTAGS, scp->comm, &recv_req[0]);
MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol,
(4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]);
#if ( DEBUGlevel>=2 )
printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1);
#endif
}
} /* if mycol == Pc(k+1) */
} /* if k+1 < nsupers */
 
if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
/* ---------------------------------------------------
Update all other blocks using block row U(k,:)
--------------------------------------------------- */
for (j = 0; j < nub; ++j) {
lptr = lptr0;
luptr = luptr0;
jb = usub[iukp]; /* Global block number of block U(k,j). */
ljb = LBj( jb, grid ); /* Local block number of U(k,j). */
nsupc = SuperSize( jb );
iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
 
/* Prepare to call DGEMM. */
jj = iukp;
while ( usub[jj] == klst ) ++jj;
ldu = klst - usub[jj++];
ncols = 1;
full = 1;
for (; jj < iukp+nsupc; ++jj) {
segsize = klst - usub[jj];
if ( segsize ) {
++ncols;
if ( segsize != ldu ) full = 0;
if ( segsize > ldu ) ldu = segsize;
}
}
#if ( DEBUGlevel>=3 )
printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
iam, full, k, jb, ldu, ncols, nsupc);
++num_update;
#endif
if ( full ) {
tempu = &uval[rukp];
} else { /* Copy block U(k,j) into tempU2d. */
#if ( DEBUGlevel>=3 )
++num_copy;
#endif
tempu = tempU2d;
for (jj = iukp; jj < iukp+nsupc; ++jj) {
segsize = klst - usub[jj];
if ( segsize ) {
lead_zero = ldu - segsize;
for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0;
tempu += lead_zero;
for (i = 0; i < segsize; ++i)
tempu[i] = uval[rukp+i];
rukp += segsize;
tempu += segsize;
}
}
tempu = tempU2d;
rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */
} /* if full ... */
 
for (lb = 0; lb < nlb; ++lb) {
ib = lsub[lptr]; /* Row block L(i,k). */
nbrow = lsub[lptr+1]; /* Number of full rows. */
lptr += LB_DESCRIPTOR; /* Skip descriptor. */
tempv = tempv2d;
#ifdef _CRAY
SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha,
&lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
tempu, &ldu, &beta, tempv, &ldt);
#else
dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha,
&lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
tempu, &ldu, &beta, tempv, &ldt);
#endif
stat->ops[FACT] += 2 * nbrow * ldu * ncols;
 
/* Now gather the result into the destination block. */
if ( ib < jb ) { /* A(i,j) is in U. */
ilst = FstBlockC( ib+1 );
lib = LBi( ib, grid );
index = Ufstnz_br_ptr[lib];
ijb = index[iuip[lib]];
while ( ijb < jb ) { /* Search for dest block. */
ruip[lib] += index[iuip[lib]+1];
iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb );
ijb = index[iuip[lib]];
}
/* Skip descriptor. Now point to fstnz index of
block U(i,j). */
iuip[lib] += UB_DESCRIPTOR;
 
tempv = tempv2d;
for (jj = 0; jj < nsupc; ++jj) {
segsize = klst - usub[iukp + jj];
fnz = index[iuip[lib]++];
if ( segsize ) { /* Nonzero segment in U(k.j). */
ucol = &Unzval_br_ptr[lib][ruip[lib]];
for (i = 0 ; i < nbrow; ++i) {
rel = lsub[lptr + i] - fnz;
ucol[rel] -= tempv[i];
}
tempv += ldt;
}
ruip[lib] += ilst - fnz;
}
} else { /* A(i,j) is in L. */
index = Lrowind_bc_ptr[ljb];
ldv = index[1]; /* LDA of the dest lusup. */
lptrj = BC_HEADER;
luptrj = 0;
ijb = index[lptrj];
while ( ijb != ib ) { /* Search for dest block --
blocks are not ordered! */
luptrj += index[lptrj+1];
lptrj += LB_DESCRIPTOR + index[lptrj+1];
ijb = index[lptrj];
}
/*
* Build indirect table. This is needed because the
* indices are not sorted for the L blocks.
*/
fnz = FstBlockC( ib );
lptrj += LB_DESCRIPTOR;
for (i = 0; i < index[lptrj-1]; ++i) {
rel = index[lptrj + i] - fnz;
indirect[rel] = i;
}
nzval = Lnzval_bc_ptr[ljb] + luptrj;
tempv = tempv2d;
for (jj = 0; jj < nsupc; ++jj) {
segsize = klst - usub[iukp + jj];
if ( segsize ) {
/*#pragma _CRI cache_bypass nzval,tempv*/
for (i = 0; i < nbrow; ++i) {
rel = lsub[lptr + i] - fnz;
nzval[indirect[rel]] -= tempv[i];
}
tempv += ldt;
}
nzval += ldv;
}
} /* if ib < jb ... */
lptr += nbrow;
luptr += nbrow;
} /* for lb ... */
rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */
iukp += nsupc;
} /* for j ... */
} /* if k L(:,k) and U(k,:) are not empty */
 
}
/* ------------------------------------------
END MAIN LOOP: for k = ...
------------------------------------------ */
 
#if ( VAMPIR>=1 )
VT_end(100);
VT_traceoff();
#endif
 
if ( Pr*Pc > 1 ) {
SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */
SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */
if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf);
if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf);
SUPERLU_FREE(send_req);
}
 
SUPERLU_FREE(Llu->ujrow);
SUPERLU_FREE(tempv2d);
SUPERLU_FREE(indirect);
SUPERLU_FREE(iuip);
SUPERLU_FREE(ruip);
 
/* Prepare error message. */
if ( *info == 0 ) *info = n + 1;
#if ( PROFlevel>=1 )
TIC(t1);
#endif
MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm );
#if ( PROFlevel>=1 )
TOC(t2, t1);
stat->utime[COMM] += t2;
{
float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
MPI_Reduce( &msg_cnt, &msg_cnt_sum,
1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
MPI_Reduce( &msg_cnt, &msg_cnt_max,
1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
MPI_Reduce( &msg_vol, &msg_vol_sum,
1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
MPI_Reduce( &msg_vol, &msg_vol_max,
1, MPI_FLOAT, MPI_MAX, 0, grid->comm );
if ( !iam ) {
printf("\tPDGSTRF comm stat:"
"\tAvg\tMax\t\tAvg\tMax\n"
"\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n",
msg_cnt_sum/Pr/Pc, msg_cnt_max,
msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6);
}
}
#endif
if ( iinfo == n + 1 ) *info = 0;
else *info = iinfo;
 
 
#if ( PRNTlevel==3 )
MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo);
MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm );
if ( !iam ) printf(".. # total msg\t%d\n", iinfo);
#endif
 
#if ( PRNTlevel==2 )
for (i = 0; i < Pr * Pc; ++i) {
if ( iam == i ) {
dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu);
dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu);
printf("(%d)\n", iam);
PrintInt10("Recv", nsupers, Llu->ToRecv);
}
MPI_Barrier( grid->comm );
}
#endif
 
#if ( DEBUGlevel>=3 )
printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update);
#endif
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit pdgstrf()");
#endif
} /* PDGSTRF */
 
 
/************************************************************************/
/*! \brief
*
* <pre>
* Purpose
* =======
* Factor diagonal and subdiagonal blocks and test for exact singularity.
* Only the process column that owns block column *k* participates
* in the work.
*
* Arguments
* =========
*
* k (input) int (global)
* The column number of the block column to be factorized.
*
* thresh (input) double (global)
* The threshold value = s_eps * anorm.
*
* Glu_persist (input) Glu_persist_t*
* Global data structures (xsup, supno) replicated on all processes.
*
* grid (input) gridinfo_t*
* The 2D process mesh.
*
* Llu (input/output) LocalLU_t*
* Local data structures to store distributed L and U matrices.
*
* stat (output) SuperLUStat_t*
* Record the statistics about the factorization.
* See SuperLUStat_t structure defined in util.h.
*
* info (output) int*
* = 0: successful exit
* < 0: if info = -i, the i-th argument had an illegal value
* > 0: if info = i, U(i,i) is exactly zero. The factorization has
* been completed, but the factor U is exactly singular,
* and division by zero will occur if it is used to solve a
* system of equations.
* </pre>
*/
static void pdgstrf2
/************************************************************************/
(
superlu_options_t *options,
int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid,
LocalLU_t *Llu, SuperLUStat_t *stat, int* info
)
 
{
int c, iam, l, pkk;
int incx = 1, incy = 1;
int nsupr; /* number of rows in the block (LDA) */
int luptr;
int_t i, krow, j, jfst, jlst;
int_t nsupc; /* number of columns in the block */
int_t *xsup = Glu_persist->xsup;
double *lusup, temp;
double *ujrow;
double alpha = -1;
*info = 0;
 
/* Quick return. */
 
/* Initialization. */
iam = grid->iam;
krow = PROW( k, grid );
pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid );
j = LBj( k, grid ); /* Local block number */
jfst = FstBlockC( k );
jlst = FstBlockC( k+1 );
lusup = Llu->Lnzval_bc_ptr[j];
nsupc = SuperSize( k );
if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1];
ujrow = Llu->ujrow;
 
luptr = 0; /* Point to the diagonal entries. */
c = nsupc;
for (j = 0; j < jlst - jfst; ++j) {
/* Broadcast the j-th row (nsupc - j) elements to
the process column. */
if ( iam == pkk ) { /* Diagonal process. */
i = luptr;
if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) {
if ( fabs(lusup[i]) < thresh ) { /* Diagonal */
#if ( PRNTlevel>=2 )
printf("(%d) .. col %d, tiny pivot %e ",
iam, jfst+j, lusup[i]);
#endif
/* Keep the replaced diagonal with the same sign. */
if ( lusup[i] < 0 ) lusup[i] = -thresh;
else lusup[i] = thresh;
#if ( PRNTlevel>=2 )
printf("replaced by %e\n", lusup[i]);
#endif
++(stat->TinyPivots);
}
}
for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i];
}
#if 0
dbcast_col(ujrow, c, pkk, UjROW, grid, &c);
#else
MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm);
/*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS,
grid, COMM_COLUMN, &c);*/
#endif
 
#if ( DEBUGlevel>=2 )
if ( k == 3329 && j == 2 ) {
if ( iam == pkk ) {
printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]);
} else {
printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]);
}
}
#endif
 
if ( !lusup ) { /* Empty block column. */
--c;
if ( ujrow[0] == 0.0 ) *info = j+jfst+1;
continue;
}
 
/* Test for singularity. */
if ( ujrow[0] == 0.0 ) {
*info = j+jfst+1;
} else {
/* Scale the j-th column of the matrix. */
temp = 1.0 / ujrow[0];
if ( iam == pkk ) {
for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp;
stat->ops[FACT] += nsupr-j-1;
} else {
for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp;
stat->ops[FACT] += nsupr;
}
}
/* Rank-1 update of the trailing submatrix. */
if ( --c ) {
if ( iam == pkk ) {
l = nsupr - j - 1;
#ifdef _CRAY
SGER(&l, &c, &alpha, &lusup[luptr+1], &incx,
&ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#else
dger_(&l, &c, &alpha, &lusup[luptr+1], &incx,
&ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr);
#endif
stat->ops[FACT] += 2 * l * c;
} else {
#ifdef _CRAY
SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx,
&ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#else
dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx,
&ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr);
#endif
stat->ops[FACT] += 2 * nsupr * c;
}
}
/* Move to the next column. */
if ( iam == pkk ) luptr += nsupr + 1;
else luptr += nsupr;
 
} /* for j ... */
 
} /* PDGSTRF2 */
 
 
/************************************************************************/
/*! \brief
*
* <pre>
* Purpose
* =======
* Perform parallel triangular solves
* U(k,:) := A(k,:) \ L(k,k).
* Only the process column that owns block column *k* participates
* in the work.
*
* Arguments
* =========
*
* m (input) int (global)
* Number of rows in the matrix.
*
* k (input) int (global)
* The row number of the block row to be factorized.
*
* Glu_persist (input) Glu_persist_t*
* Global data structures (xsup, supno) replicated on all processes.
*
* grid (input) gridinfo_t*
* The 2D process mesh.
*
* Llu (input/output) LocalLU_t*
* Local data structures to store distributed L and U matrices.
*
* stat (output) SuperLUStat_t*
* Record the statistics about the factorization;
* See SuperLUStat_t structure defined in util.h.
* </pre>
*/
static void pdgstrs2
/************************************************************************/
#ifdef _CRAY
(
int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
)
#else
(
int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
LocalLU_t *Llu, SuperLUStat_t *stat
)
#endif
 
{
int iam, pkk;
int incx = 1;
int nsupr; /* number of rows in the block L(:,k) (LDA) */
int segsize;
int_t nsupc; /* number of columns in the block */
int_t luptr, iukp, rukp;
int_t b, gb, j, klst, knsupc, lk, nb;
int_t *xsup = Glu_persist->xsup;
int_t *usub;
double *lusup, *uval;
 
/* Quick return. */
lk = LBi( k, grid ); /* Local block number */
if ( !Llu->Unzval_br_ptr[lk] ) return;
 
/* Initialization. */
iam = grid->iam;
pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid );
klst = FstBlockC( k+1 );
knsupc = SuperSize( k );
usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
uval = Llu->Unzval_br_ptr[lk];
nb = usub[0];
iukp = BR_HEADER;
rukp = 0;
if ( iam == pkk ) {
lk = LBj( k, grid );
nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
lusup = Llu->Lnzval_bc_ptr[lk];
} else {
nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */
lusup = Llu->Lval_buf_2[k%2];
}
 
/* Loop through all the row blocks. */
for (b = 0; b < nb; ++b) {
gb = usub[iukp];
nsupc = SuperSize( gb );
iukp += UB_DESCRIPTOR;
 
/* Loop through all the segments in the block. */
for (j = 0; j < nsupc; ++j) {
segsize = klst - usub[iukp++];
if ( segsize ) { /* Nonzero segment. */
luptr = (knsupc - segsize) * (nsupr + 1);
#ifdef _CRAY
STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr,
&uval[rukp], &incx);
#else
dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
&uval[rukp], &incx);
#endif
stat->ops[FACT] += segsize * (segsize + 1);
rukp += segsize;
}
}
} /* for b ... */
 
} /* PDGSTRS2 */
 
static int
probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm,
int buf_size)
{
MPI_Status status;
int count;
 
MPI_Probe( source, tag, comm, &status );
MPI_Get_count( &status, datatype, &count );
if ( count > buf_size ) {
printf("(%d) Recv'ed count %d > buffer size $d\n",
iam, count, buf_size);
exit(-1);
}
return 0;
}
/trunk/OTHER/SuperLU_DIST_2.5/SRC/ddistribute.c
New file
0,0 → 1,737
 
 
/*! @file
* \brief Distribute the matrix onto the 2D process mesh.
*
* <pre>
* -- Distributed SuperLU routine (version 2.3) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 15, 2008
* </pre>
*/
#include "superlu_ddefs.h"
 
/*! \brief
*
* <pre>
* Purpose
* =======
* Distribute the matrix onto the 2D process mesh.
*
* Arguments
* =========
*
* fact (input) fact_t
* Specifies whether or not the L and U structures will be re-used.
* = SamePattern_SameRowPerm: L and U structures are input, and
* unchanged on exit.
* = DOFACT or SamePattern: L and U structures are computed and output.
*
* n (input) int
* Dimension of the matrix.
*
* A (input) SuperMatrix*
* The original matrix A, permuted by columns, of dimension
* (A->nrow, A->ncol). The type of A can be:
* Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE.
*
* LUstruct (input) LUstruct_t*
* Data structures for L and U factors.
*
* grid (input) gridinfo_t*
* The 2D process mesh.
*
* Return value
* ============
* > 0, working storage required (in bytes).
* </pre>
*/
 
float
ddistribute(fact_t fact, int_t n, SuperMatrix *A,
Glu_freeable_t *Glu_freeable,
LUstruct_t *LUstruct, gridinfo_t *grid)
{
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
LocalLU_t *Llu = LUstruct->Llu;
int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k,
len, len1, nsupc;
int_t ljb; /* local block column number */
int_t nrbl; /* number of L blocks in current block column */
int_t nrbu; /* number of U blocks in current block column */
int_t gb; /* global block number; 0 < gb <= nsuper */
int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
int iam, jbrow, kcol, mycol, myrow, pc, pr;
int_t mybufmax[NBUFFERS];
NCPformat *Astore;
double *a;
int_t *asub;
int_t *xa_begin, *xa_end;
int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */
int_t *supno = Glu_persist->supno;
int_t *lsub, *xlsub, *usub, *xusub;
int_t nsupers;
int_t next_lind; /* next available position in index[*] */
int_t next_lval; /* next available position in nzval[*] */
int_t *index; /* indices consist of headers and row subscripts */
double *lusup, *uval; /* nonzero values in L and U */
double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
 
/*-- Counts to be used in factorization. --*/
int_t *ToRecv, *ToSendD, **ToSendR;
 
/*-- Counts to be used in lower triangular solve. --*/
int_t *fmod; /* Modification count for L-solve. */
int_t **fsendx_plist; /* Column process list to send down Xk. */
int_t nfrecvx = 0; /* Number of Xk I will receive. */
int_t nfsendx = 0; /* Number of Xk I will send */
int_t kseen;
 
/*-- Counts to be used in upper triangular solve. --*/
int_t *bmod; /* Modification count for U-solve. */
int_t **bsendx_plist; /* Column process list to send down Xk. */
int_t nbrecvx = 0; /* Number of Xk I will receive. */
int_t nbsendx = 0; /* Number of Xk I will send */
int_t *ilsum; /* starting position of each supernode in
the full array (local) */
 
/*-- Auxiliary arrays; freed on return --*/
int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */
int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */
int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */
int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */
int_t *Ucbs; /* number of column blocks in a block row */
int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */
int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */
int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */
int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */
double *dense, *dense_col; /* SPA */
double zero = 0.0;
int_t ldaspa; /* LDA of SPA */
int_t iword, dword;
float mem_use = 0.0;
 
#if ( PRNTlevel>=1 )
int_t nLblocks = 0, nUblocks = 0;
#endif
#if ( PROFlevel>=1 )
double t, t_u, t_l;
int_t u_blks;
#endif
 
/* Initialization. */
iam = grid->iam;
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
nsupers = supno[n-1] + 1;
Astore = A->Store;
a = Astore->nzval;
asub = Astore->rowind;
xa_begin = Astore->colbeg;
xa_end = Astore->colend;
#if ( PRNTlevel>=1 )
iword = sizeof(int_t);
dword = sizeof(double);
#endif
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter ddistribute()");
#endif
 
if ( fact == SamePattern_SameRowPerm ) {
/* ---------------------------------------------------------------
* REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION.
* --------------------------------------------------------------- */
 
#if ( PROFlevel>=1 )
t_l = t_u = 0; u_blks = 0;
#endif
/* We can propagate the new values of A into the existing
L and U data structures. */
ilsum = Llu->ilsum;
ldaspa = Llu->ldalsum;
if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) )
ABORT("Calloc fails for SPA dense[].");
nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
if ( !(Urb_length = intCalloc_dist(nrbu)) )
ABORT("Calloc fails for Urb_length[].");
if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
ABORT("Malloc fails for Urb_indptr[].");
Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
Unzval_br_ptr = Llu->Unzval_br_ptr;
#if ( PRNTlevel>=1 )
mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
#endif
#if ( PROFlevel>=1 )
t = SuperLU_timer_();
#endif
 
/* Initialize Uval to zero. */
for (lb = 0; lb < nrbu; ++lb) {
Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
index = Ufstnz_br_ptr[lb];
if ( index ) {
uval = Unzval_br_ptr[lb];
len = index[1];
for (i = 0; i < len; ++i) uval[i] = zero;
} /* if index != NULL */
} /* for lb ... */
 
for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
pc = PCOL( jb, grid );
if ( mycol == pc ) { /* Block column jb in my process column */
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
 
/* Scatter A into SPA (for L), or into U directly. */
for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
for (i = xa_begin[j]; i < xa_end[j]; ++i) {
irow = asub[i];
gb = BlockNum( irow );
if ( myrow == PROW( gb, grid ) ) {
lb = LBi( gb, grid );
if ( gb < jb ) { /* in U */
index = Ufstnz_br_ptr[lb];
uval = Unzval_br_ptr[lb];
while ( (k = index[Urb_indptr[lb]]) < jb ) {
/* Skip nonzero values in this block */
Urb_length[lb] += index[Urb_indptr[lb]+1];
/* Move pointer to the next block */
Urb_indptr[lb] += UB_DESCRIPTOR
+ SuperSize( k );
}
/*assert(k == jb);*/
/* start fstnz */
istart = Urb_indptr[lb] + UB_DESCRIPTOR;
len = Urb_length[lb];
fsupc1 = FstBlockC( gb+1 );
k = j - fsupc;
/* Sum the lengths of the leading columns */
for (jj = 0; jj < k; ++jj)
len += fsupc1 - index[istart++];
/*assert(irow>=index[istart]);*/
uval[len + irow - index[istart]] = a[i];
} else { /* in L; put in SPA first */
irow = ilsum[lb] + irow - FstBlockC( gb );
dense_col[irow] = a[i];
}
}
} /* for i ... */
dense_col += ldaspa;
} /* for j ... */
 
#if ( PROFlevel>=1 )
t_u += SuperLU_timer_() - t;
t = SuperLU_timer_();
#endif
 
/* Gather the values of A from SPA into Lnzval[]. */
ljb = LBj( jb, grid ); /* Local block number */
index = Lrowind_bc_ptr[ljb];
if ( index ) {
nrbl = index[0]; /* Number of row blocks. */
len = index[1]; /* LDA of lusup[]. */
lusup = Lnzval_bc_ptr[ljb];
next_lind = BC_HEADER;
next_lval = 0;
for (jj = 0; jj < nrbl; ++jj) {
gb = index[next_lind++];
len1 = index[next_lind++]; /* Rows in the block. */
lb = LBi( gb, grid );
for (bnnz = 0; bnnz < len1; ++bnnz) {
irow = index[next_lind++]; /* Global index. */
irow = ilsum[lb] + irow - FstBlockC( gb );
k = next_lval++;
for (j = 0, dense_col = dense; j < nsupc; ++j) {
lusup[k] = dense_col[irow];
dense_col[irow] = zero;
k += len;
dense_col += ldaspa;
}
} /* for bnnz ... */
} /* for jj ... */
} /* if index ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
#endif
} /* if mycol == pc */
} /* for jb ... */
 
SUPERLU_FREE(dense);
SUPERLU_FREE(Urb_length);
SUPERLU_FREE(Urb_indptr);
#if ( PROFlevel>=1 )
if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
t_l, t_u, u_blks, nrbu);
#endif
 
} else {
/* --------------------------------------------------
* FIRST TIME CREATING THE L AND U DATA STRUCTURE.
* -------------------------------------------------- */
 
#if ( PROFlevel>=1 )
t_l = t_u = 0; u_blks = 0;
#endif
/* No L and U data structures are available yet.
We need to set up the L and U data structures and propagate
the values of A into them. */
lsub = Glu_freeable->lsub; /* compressed L subscripts */
xlsub = Glu_freeable->xlsub;
usub = Glu_freeable->usub; /* compressed U subscripts */
xusub = Glu_freeable->xusub;
if ( !(ToRecv = intCalloc_dist(nsupers)) )
ABORT("Calloc fails for ToRecv[].");
 
k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */
if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
ABORT("Malloc fails for ToSendR[].");
j = k * grid->npcol;
if ( !(index = intMalloc_dist(j)) )
ABORT("Malloc fails for index[].");
#if ( PRNTlevel>=1 )
mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;
#endif
for (i = 0; i < j; ++i) index[i] = EMPTY;
for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j];
k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
 
/* Pointers to the beginning of each block row of U. */
if ( !(Unzval_br_ptr =
(double**)SUPERLU_MALLOC(k * sizeof(double*))) )
ABORT("Malloc fails for Unzval_br_ptr[].");
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Ufstnz_br_ptr[].");
if ( !(ToSendD = intCalloc_dist(k)) )
ABORT("Malloc fails for ToSendD[].");
if ( !(ilsum = intMalloc_dist(k+1)) )
ABORT("Malloc fails for ilsum[].");
 
/* Auxiliary arrays used to set up U block data structures.
They are freed on return. */
if ( !(rb_marker = intCalloc_dist(k)) )
ABORT("Calloc fails for rb_marker[].");
if ( !(Urb_length = intCalloc_dist(k)) )
ABORT("Calloc fails for Urb_length[].");
if ( !(Urb_indptr = intMalloc_dist(k)) )
ABORT("Malloc fails for Urb_indptr[].");
if ( !(Urb_fstnz = intCalloc_dist(k)) )
ABORT("Calloc fails for Urb_fstnz[].");
if ( !(Ucbs = intCalloc_dist(k)) )
ABORT("Calloc fails for Ucbs[].");
#if ( PRNTlevel>=1 )
mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword;
#endif
/* Compute ldaspa and ilsum[]. */
ldaspa = 0;
ilsum[0] = 0;
for (gb = 0; gb < nsupers; ++gb) {
if ( myrow == PROW( gb, grid ) ) {
i = SuperSize( gb );
ldaspa += i;
lb = LBi( gb, grid );
ilsum[lb + 1] = ilsum[lb] + i;
}
}
/* ------------------------------------------------------------
COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
------------------------------------------------------------*/
/* Loop through each supernode column. */
for (jb = 0; jb < nsupers; ++jb) {
pc = PCOL( jb, grid );
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
/* Loop through each column in the block. */
for (j = fsupc; j < fsupc + nsupc; ++j) {
/* usub[*] contains only "first nonzero" in each segment. */
for (i = xusub[j]; i < xusub[j+1]; ++i) {
irow = usub[i]; /* First nonzero of the segment. */
gb = BlockNum( irow );
kcol = PCOL( gb, grid );
ljb = LBj( gb, grid );
if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
pr = PROW( gb, grid );
lb = LBi( gb, grid );
if ( mycol == pc ) {
if ( myrow == pr ) {
ToSendD[lb] = YES;
/* Count nonzeros in entire block row. */
Urb_length[lb] += FstBlockC( gb+1 ) - irow;
if (rb_marker[lb] <= jb) {/* First see the block */
rb_marker[lb] = jb + 1;
Urb_fstnz[lb] += nsupc;
++Ucbs[lb]; /* Number of column blocks
in block row lb. */
#if ( PRNTlevel>=1 )
++nUblocks;
#endif
}
ToRecv[gb] = 1;
} else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
}
} /* for i ... */
} /* for j ... */
} /* for jb ... */
/* Set up the initial pointers for each block row in U. */
nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */
for (lb = 0; lb < nrbu; ++lb) {
len = Urb_length[lb];
rb_marker[lb] = 0; /* Reset block marker. */
if ( len ) {
/* Add room for descriptors */
len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
if ( !(index = intMalloc_dist(len1+1)) )
ABORT("Malloc fails for Uindex[].");
Ufstnz_br_ptr[lb] = index;
if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) )
ABORT("Malloc fails for Unzval_br_ptr[*][].");
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
index[0] = Ucbs[lb]; /* Number of column blocks */
index[1] = len; /* Total length of nzval[] */
index[2] = len1; /* Total length of index[] */
index[len1] = -1; /* End marker */
} else {
Ufstnz_br_ptr[lb] = NULL;
Unzval_br_ptr[lb] = NULL;
}
Urb_length[lb] = 0; /* Reset block length. */
Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
Urb_fstnz[lb] = BR_HEADER;
} /* for lb ... */
 
SUPERLU_FREE(Ucbs);
 
#if ( PROFlevel>=1 )
t = SuperLU_timer_() - t;
if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t);
#endif
#if ( PRNTlevel>=1 )
mem_use -= 2.0*k * iword;
#endif
/* Auxiliary arrays used to set up L block data structures.
They are freed on return.
k is the number of local row blocks. */
if ( !(Lrb_length = intCalloc_dist(k)) )
ABORT("Calloc fails for Lrb_length[].");
if ( !(Lrb_number = intMalloc_dist(k)) )
ABORT("Malloc fails for Lrb_number[].");
if ( !(Lrb_indptr = intMalloc_dist(k)) )
ABORT("Malloc fails for Lrb_indptr[].");
if ( !(Lrb_valptr = intMalloc_dist(k)) )
ABORT("Malloc fails for Lrb_valptr[].");
if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa)
*sp_ienv_dist(3)))))
ABORT("Calloc fails for SPA dense[].");
 
/* These counts will be used for triangular solves. */
if ( !(fmod = intCalloc_dist(k)) )
ABORT("Calloc fails for fmod[].");
if ( !(bmod = intCalloc_dist(k)) )
ABORT("Calloc fails for bmod[].");
#if ( PRNTlevel>=1 )
mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword;
#endif
k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
 
/* Pointers to the beginning of each block column of L. */
if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) )
ABORT("Malloc fails for Lnzval_bc_ptr[].");
if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lrowind_bc_ptr[].");
Lrowind_bc_ptr[k-1] = NULL;
 
/* These lists of processes will be used for triangular solves. */
if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
ABORT("Malloc fails for fsendx_plist[].");
len = k * grid->nprow;
if ( !(index = intMalloc_dist(len)) )
ABORT("Malloc fails for fsendx_plist[0]");
for (i = 0; i < len; ++i) index[i] = EMPTY;
for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
fsendx_plist[i] = &index[j];
if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
ABORT("Malloc fails for bsendx_plist[].");
if ( !(index = intMalloc_dist(len)) )
ABORT("Malloc fails for bsendx_plist[0]");
for (i = 0; i < len; ++i) index[i] = EMPTY;
for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
bsendx_plist[i] = &index[j];
#if ( PRNTlevel>=1 )
mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
#endif
/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
 
for (jb = 0; jb < nsupers; ++jb) {
pc = PCOL( jb, grid );
if ( mycol == pc ) { /* Block column jb in my process column */
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
ljb = LBj( jb, grid ); /* Local block number */
/* Scatter A into SPA. */
for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){
for (i = xa_begin[j]; i < xa_end[j]; ++i) {
irow = asub[i];
gb = BlockNum( irow );
if ( myrow == PROW( gb, grid ) ) {
lb = LBi( gb, grid );
irow = ilsum[lb] + irow - FstBlockC( gb );
dense_col[irow] = a[i];
}
}
dense_col += ldaspa;
}
 
jbrow = PROW( jb, grid );
 
#if ( PROFlevel>=1 )
t = SuperLU_timer_();
#endif
/*------------------------------------------------
* SET UP U BLOCKS.
*------------------------------------------------*/
kseen = 0;
dense_col = dense;
/* Loop through each column in the block column. */
for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
istart = xusub[j];
/* NOTE: Only the first nonzero index of the segment
is stored in usub[]. */
for (i = istart; i < xusub[j+1]; ++i) {
irow = usub[i]; /* First nonzero in the segment. */
gb = BlockNum( irow );
pr = PROW( gb, grid );
if ( pr != jbrow &&
myrow == jbrow && /* diag. proc. owning jb */
bsendx_plist[ljb][pr] == EMPTY ) {
bsendx_plist[ljb][pr] = YES;
++nbsendx;
}
if ( myrow == pr ) {
lb = LBi( gb, grid ); /* Local block number */
index = Ufstnz_br_ptr[lb];
uval = Unzval_br_ptr[lb];
fsupc1 = FstBlockC( gb+1 );
if (rb_marker[lb] <= jb) { /* First time see
the block */
rb_marker[lb] = jb + 1;
Urb_indptr[lb] = Urb_fstnz[lb];;
index[Urb_indptr[lb]] = jb; /* Descriptor */
Urb_indptr[lb] += UB_DESCRIPTOR;
/* Record the first location in index[] of the
next block */
Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
len = Urb_indptr[lb];/* Start fstnz in index */
index[len-1] = 0;
for (k = 0; k < nsupc; ++k)
index[len+k] = fsupc1;
if ( gb != jb )/* Exclude diagonal block. */
++bmod[lb];/* Mod. count for back solve */
if ( kseen == 0 && myrow != jbrow ) {
++nbrecvx;
kseen = 1;
}
} else { /* Already saw the block */
len = Urb_indptr[lb];/* Start fstnz in index */
}
jj = j - fsupc;
index[len+jj] = irow;
/* Load the numerical values */
k = fsupc1 - irow; /* No. of nonzeros in segment */
index[len-1] += k; /* Increment block length in
Descriptor */
irow = ilsum[lb] + irow - FstBlockC( gb );
for (ii = 0; ii < k; ++ii) {
uval[Urb_length[lb]++] = dense_col[irow + ii];
dense_col[irow + ii] = zero;
}
} /* if myrow == pr ... */
} /* for i ... */
dense_col += ldaspa;
} /* for j ... */
 
#if ( PROFlevel>=1 )
t_u += SuperLU_timer_() - t;
t = SuperLU_timer_();
#endif
 
/*------------------------------------------------
* SET UP L BLOCKS.
*------------------------------------------------*/
 
/* Count number of blocks and length of each block. */
nrbl = 0;
len = 0; /* Number of row subscripts I own. */
kseen = 0;
istart = xlsub[fsupc];
for (i = istart; i < xlsub[fsupc+1]; ++i) {
irow = lsub[i];
gb = BlockNum( irow ); /* Global block number */
pr = PROW( gb, grid ); /* Process row owning this block */
if ( pr != jbrow &&
myrow == jbrow && /* diag. proc. owning jb */
fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
fsendx_plist[ljb][pr] = YES;
++nfsendx;
}
if ( myrow == pr ) {
lb = LBi( gb, grid ); /* Local block number */
if (rb_marker[lb] <= jb) { /* First see this block */
rb_marker[lb] = jb + 1;
Lrb_length[lb] = 1;
Lrb_number[nrbl++] = gb;
if ( gb != jb ) /* Exclude diagonal block. */
++fmod[lb]; /* Mod. count for forward solve */
if ( kseen == 0 && myrow != jbrow ) {
++nfrecvx;
kseen = 1;
}
#if ( PRNTlevel>=1 )
++nLblocks;
#endif
} else {
++Lrb_length[lb];
}
++len;
}
} /* for i ... */
 
if ( nrbl ) { /* Do not ensure the blocks are sorted! */
/* Set up the initial pointers for each block in
index[] and nzval[]. */
/* Add room for descriptors */
len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
if ( !(index = intMalloc_dist(len1)) )
ABORT("Malloc fails for index[]");
Lrowind_bc_ptr[ljb] = index;
if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) {
fprintf(stderr, "col block %d ", jb);
ABORT("Malloc fails for Lnzval_bc_ptr[*][]");
}
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
index[0] = nrbl; /* Number of row blocks */
index[1] = len; /* LDA of the nzval[] */
next_lind = BC_HEADER;
next_lval = 0;
for (k = 0; k < nrbl; ++k) {
gb = Lrb_number[k];
lb = LBi( gb, grid );
len = Lrb_length[lb];
Lrb_length[lb] = 0; /* Reset vector of block length */
index[next_lind++] = gb; /* Descriptor */
index[next_lind++] = len;
Lrb_indptr[lb] = next_lind;
Lrb_valptr[lb] = next_lval;
next_lind += len;
next_lval += len;
}
/* Propagate the compressed row subscripts to Lindex[], and
the initial values of A from SPA into Lnzval[]. */
lusup = Lnzval_bc_ptr[ljb];
len = index[1]; /* LDA of lusup[] */
for (i = istart; i < xlsub[fsupc+1]; ++i) {
irow = lsub[i];
gb = BlockNum( irow );
if ( myrow == PROW( gb, grid ) ) {
lb = LBi( gb, grid );
k = Lrb_indptr[lb]++; /* Random access a block */
index[k] = irow;
k = Lrb_valptr[lb]++;
irow = ilsum[lb] + irow - FstBlockC( gb );
for (j = 0, dense_col = dense; j < nsupc; ++j) {
lusup[k] = dense_col[irow];
dense_col[irow] = 0.0;
k += len;
dense_col += ldaspa;
}
}
} /* for i ... */
} else {
Lrowind_bc_ptr[ljb] = NULL;
Lnzval_bc_ptr[ljb] = NULL;
} /* if nrbl ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
#endif
} /* if mycol == pc */
 
} /* for jb ... */
 
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
Llu->Unzval_br_ptr = Unzval_br_ptr;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
Llu->ToSendR = ToSendR;
Llu->fmod = fmod;
Llu->fsendx_plist = fsendx_plist;
Llu->nfrecvx = nfrecvx;
Llu->nfsendx = nfsendx;
Llu->bmod = bmod;
Llu->bsendx_plist = bsendx_plist;
Llu->nbrecvx = nbrecvx;
Llu->nbsendx = nbsendx;
Llu->ilsum = ilsum;
Llu->ldalsum = ldaspa;
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n",
nLblocks, nUblocks);
#endif
 
SUPERLU_FREE(rb_marker);
SUPERLU_FREE(Urb_fstnz);
SUPERLU_FREE(Urb_length);
SUPERLU_FREE(Urb_indptr);
SUPERLU_FREE(Lrb_length);
SUPERLU_FREE(Lrb_number);
SUPERLU_FREE(Lrb_indptr);
SUPERLU_FREE(Lrb_valptr);
SUPERLU_FREE(dense);
 
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(Llu->mod_bit = intMalloc_dist(k)) )
ABORT("Malloc fails for mod_bit[].");
 
/* Find the maximum buffer size. */
MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t,
MPI_MAX, grid->comm);
 
#if ( PROFlevel>=1 )
if ( !iam ) printf(".. 1st distribute time:\n "
"\tL\t%.2f\n\tU\t%.2f\n"
"\tu_blks %d\tnrbu %d\n--------\n",
t_l, t_u, u_blks, nrbu);
#endif
 
} /* else fact != SamePattern_SameRowPerm */
 
#if ( DEBUGlevel>=1 )
/* Memory allocated but not freed:
ilsum, fmod, fsendx_plist, bmod, bsendx_plist */
CHECK_MALLOC(iam, "Exit ddistribute()");
#endif
 
return (mem_use);
} /* DDISTRIBUTE */
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/pdgstrs_Bglobal_Bsend.c
New file
0,0 → 1,1007
/*! @file
* \brief Solves a system of distributed linear equations
*
* <pre>
* -- Distributed SuperLU routine (version 1.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* September 1, 1999
*
* Modified:
* Feburary 7, 2001 use MPI_Isend/MPI_Irecv
* October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test
* </pre>
*/
 
#include "superlu_ddefs.h"
 
 
/*#define ISEND_IRECV*/
 
/* Parry's change
Use MPI_Bsend with a large buffer attached in the main program */
#define BSEND 1
 
/*
* Function prototypes
*/
#ifdef _CRAY
fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*,
double*, int*, double*, int*);
fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*,
int*, double*, int*, double*, double*, int*);
_fcd ftcs1;
_fcd ftcs2;
_fcd ftcs3;
#endif
 
/*! \brief
*
* <pre>
* Purpose
* =======
*
* pdgstrs_Bglobal solves a system of distributed linear equations
* A*X = B with a general N-by-N matrix A using the LU factorization
* computed by pdgstrf.
*
* Arguments
* =========
*
* n (input) int (global)
* The order of the system of linear equations.
*
* LUstruct (input) LUstruct_t*
* The distributed data structures storing L and U factors.
* The L and U factors are obtained from pdgstrf for
* the possibly scaled and permuted matrix A.
* See superlu_ddefs.h for the definition of 'LUstruct_t'.
*
* grid (input) gridinfo_t*
* The 2D process mesh. It contains the MPI communicator, the number
* of process rows (NPROW), the number of process columns (NPCOL),
* and my process rank. It is an input argument to all the
* parallel routines.
* Grid can be initialized by subroutine SUPERLU_GRIDINIT.
* See superlu_ddefs.h for the definition of 'gridinfo_t'.
*
* B (input/output) double*
* On entry, the right-hand side matrix of the possibly equilibrated
* and row permuted system.
* On exit, the solution matrix of the possibly equilibrated
* and row permuted system if info = 0;
*
* NOTE: Currently, the N-by-NRHS matrix B must reside on all
* processes when calling this routine.
*
* ldb (input) int (global)
* Leading dimension of matrix B.
*
* nrhs (input) int (global)
* Number of right-hand sides.
*
* stat (output) SuperLUStat_t*
* Record the statistics about the triangular solves.
* See util.h for the definition of 'SuperLUStat_t'.
*
* info (output) int*
* = 0: successful exit
* < 0: if info = -i, the i-th argument had an illegal value
* </pre>
*/
void
pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *B,
int_t ldb, int nrhs, SuperLUStat_t *stat, int *info)
{
 
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
LocalLU_t *Llu = LUstruct->Llu;
double alpha = 1.0;
double *lsum; /* Local running sum of the updates to B-components */
double *x; /* X component at step k. */
double *lusup, *dest;
double *recvbuf, *tempv;
double *rtemp; /* Result of full matrix-vector multiply. */
int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
int_t iam, kcol, krow, mycol, myrow;
int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr;
int_t nb, nlb, nub, nsupers;
int_t *xsup, *lsub, *usub;
int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/
int_t Pc, Pr;
int knsupc, nsupr;
int ldalsum; /* Number of lsum entries locally owned. */
int maxrecvsz, p, pi;
int_t **Lrowind_bc_ptr;
double **Lnzval_bc_ptr;
MPI_Status status;
#if defined(ISEND_IRECV) || defined(BSEND)
MPI_Request *send_req, recv_req;
int test_flag;
#endif
 
/*-- Counts used for L-solve --*/
int_t *fmod; /* Modification count for L-solve. */
int_t **fsendx_plist = Llu->fsendx_plist;
int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */
int_t *frecv; /* Count of modifications to be recv'd from
processes in this row. */
int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */
int_t nleaf = 0, nroot = 0;
 
/*-- Counts used for U-solve --*/
int_t *bmod; /* Modification count for L-solve. */
int_t **bsendx_plist = Llu->bsendx_plist;
int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */
int_t *brecv; /* Count of modifications to be recv'd from
processes in this row. */
int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */
double t;
#if ( DEBUGlevel>=2 )
int_t Ublocks = 0;
#endif
/*-- Function prototypes --*/
extern void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *,
LocalLU_t *, gridinfo_t *, int_t, int_t [],
int_t [], double [], int_t, double []);
 
t = SuperLU_timer_();
 
/* Test input parameters. */
*info = 0;
if ( n < 0 ) *info = -1;
else if ( nrhs < 0 ) *info = -9;
if ( *info ) {
pxerbla("PDGSTRS_BGLOBAL", grid, -*info);
return;
}
/*
* Initialization.
*/
iam = grid->iam;
#ifdef BSEND
if(!iam) {
printf("Using MPI_Bsend in triangular solve\n");
fflush(stdout);
}
#endif
Pc = grid->npcol;
Pr = grid->nprow;
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
nsupers = Glu_persist->supno[n-1] + 1;
xsup = Glu_persist->xsup;
Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */
 
stat->ops[SOLVE] = 0.0;
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()");
#endif
 
/* Save the count to be altered so it can be used by
subsequent call to PDGSTRS_BGLOBAL. */
if ( !(fmod = intMalloc_dist(nlb)) )
ABORT("Calloc fails for fmod[].");
for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i];
if ( !(frecv = intMalloc_dist(nlb)) )
ABORT("Malloc fails for frecv[].");
Llu->frecv = frecv;
 
#if defined(ISEND_IRECV) || defined(BSEND)
if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))) )
ABORT("Malloc fails for send_req[].");
for (i = 0; i < Pr; ++i) send_req[i] = MPI_REQUEST_NULL;
#endif
 
#ifdef _CRAY
ftcs1 = _cptofcd("L", strlen("L"));
ftcs2 = _cptofcd("N", strlen("N"));
ftcs3 = _cptofcd("U", strlen("U"));
#endif
 
 
/* Obtain ilsum[] and ldalsum for process column 0. */
ilsum = Llu->ilsum;
ldalsum = Llu->ldalsum;
 
/* Allocate working storage. */
knsupc = sp_ienv_dist(3);
maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H)))
ABORT("Calloc fails for lsum[].");
if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) )
ABORT("Malloc fails for x[].");
if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) )
ABORT("Malloc fails for recvbuf[].");
if ( !(rtemp = doubleMalloc_dist(maxrecvsz)) )
ABORT("Malloc fails for rtemp[].");
 
 
/*---------------------------------------------------
* Forward solve Ly = b.
*---------------------------------------------------*/
 
/*
* Copy B into X on the diagonal processes.
*/
ii = 0;
for (k = 0; k < nsupers; ++k) {
knsupc = SuperSize( k );
krow = PROW( k, grid );
if ( myrow == krow ) {
lk = LBi( k, grid ); /* Local block number. */
il = LSUM_BLK( lk );
lsum[il - LSUM_H] = k; /* Block number prepended in the header. */
kcol = PCOL( k, grid );
if ( mycol == kcol ) { /* Diagonal process. */
jj = X_BLK( lk );
x[jj - XK_H] = k; /* Block number prepended in the header. */
RHS_ITERATE(j)
for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */
x[i + jj + j*knsupc] = B[i + ii + j*ldb];
}
}
ii += knsupc;
}
 
/*
* Compute frecv[] and nfrecvmod counts on the diagonal processes.
*/
{
superlu_scope_t *scp = &grid->rscp;
 
for (k = 0; k < nsupers; ++k) {
krow = PROW( k, grid );
if ( myrow == krow ) {
lk = LBi( k, grid ); /* Local block number. */
kcol = PCOL( k, grid ); /* Root process in this row scope. */
if ( mycol != kcol && fmod[lk] )
i = 1; /* Contribution from non-diagonal process. */
else i = 0;
MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t,
MPI_SUM, kcol, scp->comm );
if ( mycol == kcol ) { /* Diagonal process. */
nfrecvmod += frecv[lk];
if ( !frecv[lk] && !fmod[lk] ) ++nleaf;
#if ( DEBUGlevel>=2 )
printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]);
assert( frecv[lk] < Pc );
#endif
}
}
}
}
 
/* ---------------------------------------------------------
Solve the leaf nodes first by all the diagonal processes.
--------------------------------------------------------- */
#if ( DEBUGlevel>=1 )
printf("(%2d) nleaf %4d\n", iam, nleaf);
#endif
for (k = 0; k < nsupers && nleaf; ++k) {
krow = PROW( k, grid );
kcol = PCOL( k, grid );
if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
knsupc = SuperSize( k );
lk = LBi( k, grid );
if ( frecv[lk]==0 && fmod[lk]==0 ) {
fmod[lk] = -1; /* Do not solve X[k] in the future. */
ii = X_BLK( lk );
lk = LBj( k, grid ); /* Local block number, column-wise. */
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
nsupr = lsub[1];
#ifdef _CRAY
STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
lusup, &nsupr, &x[ii], &knsupc);
#else
dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
lusup, &nsupr, &x[ii], &knsupc);
#endif
stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
--nleaf;
#if ( DEBUGlevel>=2 )
printf("(%2d) Solve X[%2d]\n", iam, k);
#endif
/*
* Send Xk to process column Pc[k].
*/
for (p = 0; p < Pr; ++p)
if ( fsendx_plist[lk][p] != EMPTY ) {
pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
#if 1
MPI_Test( &send_req[p], &test_flag, &status );
#else
if ( send_req[p] != MPI_REQUEST_NULL )
MPI_Wait( &send_req[p], &status );
#endif
MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]);
#else
#ifdef BSEND
MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm );
#else
MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#endif
#if ( DEBUGlevel>=2 )
printf("(%2d) Sent X[%2.0f] to P %2d\n",
iam, x[ii-XK_H], pi);
#endif
}
/*
* Perform local block modifications: lsum[i] -= L_i,k * X[k]
*/
nb = lsub[0] - 1;
lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
luptr = knsupc; /* Skip diagonal block L(k,k). */
dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
fmod, nb, lptr, luptr, xsup, grid, Llu,
send_req,stat);
#ifdef ISEND_IRECV
/* Wait for previous Isends to complete. */
for (p = 0; p < Pr; ++p) {
if ( fsendx_plist[lk][p] != EMPTY )
/*MPI_Wait( &send_req[p], &status );*/
MPI_Test( &send_req[p], &test_flag, &status );
}
#endif
}
} /* if diagonal process ... */
} /* for k ... */
 
/* -----------------------------------------------------------
Compute the internal nodes asynchronously by all processes.
----------------------------------------------------------- */
#if ( DEBUGlevel>=1 )
printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n",
iam, nfrecvx, nfrecvmod, nleaf);
#endif
 
while ( nfrecvx || nfrecvmod ) { /* While not finished. */
 
/* Receive a message. */
#ifdef ISEND_IRECV
/* -MPI- FATAL: Remote protocol queue full */
MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
MPI_ANY_TAG, grid->comm, &recv_req );
MPI_Wait( &recv_req, &status );
#else
MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
MPI_ANY_TAG, grid->comm, &status );
#endif
 
k = *recvbuf;
 
#if ( DEBUGlevel>=2 )
printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
#endif
switch ( status.MPI_TAG ) {
case Xk:
--nfrecvx;
lk = LBj( k, grid ); /* Local block number, column-wise. */
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
if ( lsub ) {
nb = lsub[0];
lptr = BC_HEADER;
luptr = 0;
knsupc = SuperSize( k );
 
/*
* Perform local block modifications: lsum[i] -= L_i,k * X[k]
*/
dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k,
fmod, nb, lptr, luptr, xsup, grid, Llu,
send_req, stat);
} /* if lsub */
 
break;
 
case LSUM:
--nfrecvmod;
lk = LBi( k, grid ); /* Local block number, row-wise. */
ii = X_BLK( lk );
knsupc = SuperSize( k );
tempv = &recvbuf[LSUM_H];
RHS_ITERATE(j)
for (i = 0; i < knsupc; ++i)
x[i + ii + j*knsupc] += tempv[i + j*knsupc];
 
if ( (--frecv[lk])==0 && fmod[lk]==0 ) {
fmod[lk] = -1; /* Do not solve X[k] in the future. */
lk = LBj( k, grid ); /* Local block number, column-wise. */
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
nsupr = lsub[1];
#ifdef _CRAY
STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha,
lusup, &nsupr, &x[ii], &knsupc);
#else
dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
lusup, &nsupr, &x[ii], &knsupc);
#endif
stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;
#if ( DEBUGlevel>=2 )
printf("(%2d) Solve X[%2d]\n", iam, k);
#endif
/*
* Send Xk to process column Pc[k].
*/
kcol = PCOL( k, grid );
for (p = 0; p < Pr; ++p)
if ( fsendx_plist[lk][p] != EMPTY ) {
pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
#if 1
MPI_Test( &send_req[p], &test_flag, &status );
#else
if ( send_req[p] != MPI_REQUEST_NULL )
MPI_Wait( &send_req[p], &status );
#endif
MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm,
&send_req[p]);
#else
#ifdef BSEND
MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm );
#else
MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#endif
#if ( DEBUGlevel>=2 )
printf("(%2d) Sent X[%2.0f] to P %2d\n",
iam, x[ii-XK_H], pi);
#endif
}
 
/*
* Perform local block modifications.
*/
nb = lsub[0] - 1;
lptr = BC_HEADER + LB_DESCRIPTOR + knsupc;
luptr = knsupc; /* Skip diagonal block L(k,k). */
 
dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k,
fmod, nb, lptr, luptr, xsup, grid, Llu,
send_req, stat);
#ifdef ISEND_IRECV
/* Wait for the previous Isends to complete. */
for (p = 0; p < Pr; ++p) {
if ( fsendx_plist[lk][p] != EMPTY )
MPI_Test( &send_req[p], &test_flag, &status );
}
#endif
} /* if */
 
break;
 
#if ( DEBUGlevel>=1 )
default:
printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
break;
#endif
} /* switch */
 
} /* while not finished ... */
 
 
#if ( PRNTlevel>=2 )
t = SuperLU_timer_() - t;
if ( !iam ) printf(".. L-solve time\t%8.2f\n", t);
t = SuperLU_timer_();
#endif
 
#if ( PRNTlevel==2 )
if ( !iam ) printf("\n.. After L-solve: y =\n");
for (i = 0, k = 0; k < nsupers; ++k) {
krow = PROW( k, grid );
kcol = PCOL( k, grid );
if ( myrow == krow && mycol == kcol ) { /* Diagonal process */
knsupc = SuperSize( k );
lk = LBi( k, grid );
ii = X_BLK( lk );
for (j = 0; j < knsupc; ++j)
printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]);
}
MPI_Barrier( grid->comm );
}
#endif
 
SUPERLU_FREE(fmod);
SUPERLU_FREE(frecv);
SUPERLU_FREE(rtemp);
 
/* MPI_Barrier( grid->comm ); Drain messages in the forward solve. */
 
 
/*---------------------------------------------------
* Back solve Ux = y.
*
* The Y components from the forward solve is already
* on the diagonal processes.
*---------------------------------------------------*/
 
/* Save the count to be altered so it can be used by
subsequent call to PDGSTRS_BGLOBAL. */
if ( !(bmod = intMalloc_dist(nlb)) )
ABORT("Calloc fails for bmod[].");
for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i];
if ( !(brecv = intMalloc_dist(nlb)) )
ABORT("Malloc fails for brecv[].");
Llu->brecv = brecv;
 
/*
* Compute brecv[] and nbrecvmod counts on the diagonal processes.
*/
{
superlu_scope_t *scp = &grid->rscp;
 
for (k = 0; k < nsupers; ++k) {
krow = PROW( k, grid );
if ( myrow == krow ) {
lk = LBi( k, grid ); /* Local block number. */
kcol = PCOL( k, grid ); /* Root process in this row scope. */
if ( mycol != kcol && bmod[lk] )
i = 1; /* Contribution from non-diagonal process. */
else i = 0;
MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t,
MPI_SUM, kcol, scp->comm );
if ( mycol == kcol ) { /* Diagonal process. */
nbrecvmod += brecv[lk];
if ( !brecv[lk] && !bmod[lk] ) ++nroot;
#if ( DEBUGlevel>=2 )
printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]);
assert( brecv[lk] < Pc );
#endif
}
}
}
}
 
/* Re-initialize lsum to zero. Each block header is already in place. */
for (k = 0; k < nsupers; ++k) {
krow = PROW( k, grid );
if ( myrow == krow ) {
knsupc = SuperSize( k );
lk = LBi( k, grid );
il = LSUM_BLK( lk );
dest = &lsum[il];
RHS_ITERATE(j)
for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0;
}
}
 
/* Set up additional pointers for the index and value arrays of U.
nlb is the number of local block rows. */
nub = CEILING( nsupers, Pc ); /* Number of local block columns. */
if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) )
ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero
blocks in a block column. */
Urbs1 = Urbs + nub;
if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) )
ABORT("Malloc fails for Ucb_indptr[]");
if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
ABORT("Malloc fails for Ucb_valptr[]");
 
/* Count number of row blocks in a block column.
One pass of the skeleton graph of U. */
for (lk = 0; lk < nlb; ++lk) {
usub = Ufstnz_br_ptr[lk];
if ( usub ) { /* Not an empty block row. */
/* usub[0] -- number of column blocks in this block row. */
#if ( DEBUGlevel>=2 )
Ublocks += usub[0];
#endif
i = BR_HEADER; /* Pointer in index array. */
for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
k = usub[i]; /* Global block number */
++Urbs[LBj(k,grid)];
i += UB_DESCRIPTOR + SuperSize( k );
}
}
}
 
/* Set up the vertical linked lists for the row blocks.
One pass of the skeleton graph of U. */
for (lb = 0; lb < nub; ++lb)
if ( Urbs[lb] ) { /* Not an empty block column. */
if ( !(Ucb_indptr[lb]
= SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
}
for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
usub = Ufstnz_br_ptr[lk];
if ( usub ) { /* Not an empty block row. */
i = BR_HEADER; /* Pointer in index array. */
j = 0; /* Pointer in nzval array. */
for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */
k = usub[i]; /* Global block number, column-wise. */
ljb = LBj( k, grid ); /* Local block number, column-wise. */
Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk;
Ucb_indptr[ljb][Urbs1[ljb]].indpos = i;
Ucb_valptr[ljb][Urbs1[ljb]] = j;
++Urbs1[ljb];
j += usub[i+1];
i += UB_DESCRIPTOR + SuperSize( k );
}
}
}
 
#if ( DEBUGlevel>=2 )
for (p = 0; p < Pr*Pc; ++p) {
if (iam == p) {
printf("(%2d) .. Ublocks %d\n", iam, Ublocks);
for (lb = 0; lb < nub; ++lb) {
printf("(%2d) Local col %2d: # row blocks %2d\n",
iam, lb, Urbs[lb]);
if ( Urbs[lb] ) {
for (i = 0; i < Urbs[lb]; ++i)
printf("(%2d) .. row blk %2d:\
lbnum %d, indpos %d, valpos %d\n",
iam, i,
Ucb_indptr[lb][i].lbnum,
Ucb_indptr[lb][i].indpos,
Ucb_valptr[lb][i]);
}
}
}
MPI_Barrier( grid->comm );
}
for (p = 0; p < Pr*Pc; ++p) {
if ( iam == p ) {
printf("\n(%d) bsendx_plist[][]", iam);
for (lb = 0; lb < nub; ++lb) {
printf("\n(%d) .. local col %2d: ", iam, lb);
for (i = 0; i < Pr; ++i)
printf("%4d", bsendx_plist[lb][i]);
}
printf("\n");
}
MPI_Barrier( grid->comm );
}
#endif /* DEBUGlevel */
 
 
#if ( PRNTlevel>=3 )
t = SuperLU_timer_() - t;
if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t);
t = SuperLU_timer_();
#endif
 
/*
* Solve the roots first by all the diagonal processes.
*/
#if ( DEBUGlevel>=1 )
printf("(%2d) nroot %4d\n", iam, nroot);
#endif
for (k = nsupers-1; k >= 0 && nroot; --k) {
krow = PROW( k, grid );
kcol = PCOL( k, grid );
if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */
knsupc = SuperSize( k );
lk = LBi( k, grid ); /* Local block number, row-wise. */
if ( brecv[lk]==0 && bmod[lk]==0 ) {
bmod[lk] = -1; /* Do not solve X[k] in the future. */
ii = X_BLK( lk );
lk = LBj( k, grid ); /* Local block number, column-wise */
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
nsupr = lsub[1];
#ifdef _CRAY
STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
lusup, &nsupr, &x[ii], &knsupc);
#else
dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
lusup, &nsupr, &x[ii], &knsupc);
#endif
stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
--nroot;
#if ( DEBUGlevel>=2 )
printf("(%2d) Solve X[%2d]\n", iam, k);
#endif
/*
* Send Xk to process column Pc[k].
*/
for (p = 0; p < Pr; ++p)
if ( bsendx_plist[lk][p] != EMPTY ) {
pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
#if 1
MPI_Test( &send_req[p], &test_flag, &status );
#else
if ( send_req[p] != MPI_REQUEST_NULL )
MPI_Wait( &send_req[p], &status );
#endif
MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]);
#else
#ifdef BSEND
MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm );
#else
MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#endif
#if ( DEBUGlevel>=2 )
printf("(%2d) Sent X[%2.0f] to P %2d\n",
iam, x[ii-XK_H], pi);
#endif
}
/*
* Perform local block modifications: lsum[i] -= U_i,k * X[k]
*/
if ( Urbs[lk] )
dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
send_req, stat);
#ifdef ISEND_IRECV
/* Wait for the previous Isends to complete. */
for (p = 0; p < Pr; ++p) {
if ( bsendx_plist[lk][p] != EMPTY )
/*MPI_Wait( &send_req[p], &status );*/
MPI_Test( &send_req[p], &test_flag, &status );
}
#endif
} /* if root ... */
} /* if diagonal process ... */
} /* for k ... */
 
 
/*
* Compute the internal nodes asychronously by all processes.
*/
while ( nbrecvx || nbrecvmod ) { /* While not finished. */
 
/* Receive a message. */
MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE,
MPI_ANY_TAG, grid->comm, &status );
k = *recvbuf;
 
#if ( DEBUGlevel>=2 )
printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG);
#endif
 
switch ( status.MPI_TAG ) {
case Xk:
--nbrecvx;
lk = LBj( k, grid ); /* Local block number, column-wise. */
/*
* Perform local block modifications:
* lsum[i] -= U_i,k * X[k]
*/
dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs,
Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
send_req, stat);
 
break;
 
case LSUM:
--nbrecvmod;
lk = LBi( k, grid ); /* Local block number, row-wise. */
ii = X_BLK( lk );
knsupc = SuperSize( k );
tempv = &recvbuf[LSUM_H];
RHS_ITERATE(j)
for (i = 0; i < knsupc; ++i)
x[i + ii + j*knsupc] += tempv[i + j*knsupc];
 
if ( (--brecv[lk])==0 && bmod[lk]==0 ) {
bmod[lk] = -1; /* Do not solve X[k] in the future. */
lk = LBj( k, grid ); /* Local block number, column-wise. */
lsub = Lrowind_bc_ptr[lk];
lusup = Lnzval_bc_ptr[lk];
nsupr = lsub[1];
#ifdef _CRAY
STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha,
lusup, &nsupr, &x[ii], &knsupc);
#else
dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha,
lusup, &nsupr, &x[ii], &knsupc);
#endif
stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;
#if ( DEBUGlevel>=2 )
printf("(%2d) Solve X[%2d]\n", iam, k);
#endif
/*
* Send Xk to process column Pc[k].
*/
kcol = PCOL( k, grid );
for (p = 0; p < Pr; ++p)
if ( bsendx_plist[lk][p] != EMPTY ) {
pi = PNUM( p, kcol, grid );
#ifdef ISEND_IRECV
#if 1
MPI_Test( &send_req[p], &test_flag, &status );
#else
if ( send_req[p] != MPI_REQUEST_NULL )
MPI_Wait( &send_req[p], &status );
#endif
MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm,
&send_req[p] );
#else
#ifdef BSEND
MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm );
#else
MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H,
MPI_DOUBLE, pi, Xk, grid->comm );
#endif
#endif
#if ( DEBUGlevel>=2 )
printf("(%2d) Sent X[%2.0f] to P %2d\n",
iam, x[ii - XK_H], pi);
#endif
}
/*
* Perform local block modifications:
* lsum[i] -= U_i,k * X[k]
*/
if ( Urbs[lk] )
dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs,
Ucb_indptr, Ucb_valptr, xsup, grid, Llu,
send_req, stat);
#ifdef ISEND_IRECV
/* Wait for the previous Isends to complete. */
for (p = 0; p < Pr; ++p) {
if ( bsendx_plist[lk][p] != EMPTY )
/*MPI_Wait( &send_req[p], &status );*/
MPI_Test( &send_req[p], &test_flag, &status );
}
#endif
} /* if becomes solvable */
break;
 
#if ( DEBUGlevel>=1 )
default:
printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG);
break;
#endif
 
} /* switch */
 
} /* while not finished ... */
 
#if ( PRNTlevel>=3 )
t = SuperLU_timer_() - t;
if ( !iam ) printf(".. U-solve time\t%8.2f\n", t);
#endif
 
 
/* Copy the solution X into B (on all processes). */
{
int_t num_diag_procs, *diag_procs, *diag_len;
double *work;
 
get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
&diag_procs, &diag_len);
jj = diag_len[0];
for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]);
if ( !(work = doubleMalloc_dist(jj*nrhs)) )
ABORT("Malloc fails for work[]");
gather_diag_to_all(n, nrhs, x, Glu_persist, Llu,
grid, num_diag_procs, diag_procs, diag_len,
B, ldb, work);
SUPERLU_FREE(diag_procs);
SUPERLU_FREE(diag_len);
SUPERLU_FREE(work);
}
 
/* Deallocate storage. */
 
SUPERLU_FREE(lsum);
SUPERLU_FREE(x);
SUPERLU_FREE(recvbuf);
for (i = 0; i < nub; ++i)
if ( Urbs[i] ) {
SUPERLU_FREE(Ucb_indptr[i]);
SUPERLU_FREE(Ucb_valptr[i]);
}
SUPERLU_FREE(Ucb_indptr);
SUPERLU_FREE(Ucb_valptr);
SUPERLU_FREE(Urbs);
SUPERLU_FREE(bmod);
SUPERLU_FREE(brecv);
#ifdef ISEND_IRECV
for (p = 0; p < Pr; ++p) {
if ( send_req[p] != MPI_REQUEST_NULL )
MPI_Wait( &send_req[p], &status );
}
SUPERLU_FREE(send_req);
#endif
 
stat->utime[SOLVE] = SuperLU_timer_() - t;
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()");
#endif
/* Chao debug */
 
MPI_Barrier( grid->comm ); /* Drain messages in the forward solve. */
 
} /* PDGSTRS_BGLOBAL */
 
 
/*! \brief
*
* <pre>
* Gather the components of x vector on the diagonal processes
* onto all processes, and combine them into the global vector y.
* </pre>
*/
static void
gather_diag_to_all(int_t n, int_t nrhs, double x[],
Glu_persist_t *Glu_persist, LocalLU_t *Llu,
gridinfo_t *grid, int_t num_diag_procs,
int_t diag_procs[], int_t diag_len[],
double y[], int_t ldy, double work[])
{
int_t i, ii, j, k, lk, lwork, nsupers, p;
int_t *ilsum, *xsup;
int iam, knsupc, pkk;
double *x_col, *y_col;
iam = grid->iam;
nsupers = Glu_persist->supno[n-1] + 1;
xsup = Glu_persist->xsup;
ilsum = Llu->ilsum;
 
for (p = 0; p < num_diag_procs; ++p) {
pkk = diag_procs[p];
if ( iam == pkk ) {
/* Copy x vector into a buffer. */
lwork = 0;
for (k = p; k < nsupers; k += num_diag_procs) {
knsupc = SuperSize( k );
lk = LBi( k, grid );
ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/
x_col = &x[ii];
for (j = 0; j < nrhs; ++j) {
for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i];
lwork += knsupc;
x_col += knsupc;
}
}
MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm );
} else {
MPI_Bcast( work, diag_len[p]*nrhs, MPI_DOUBLE, pkk, grid->comm );
}
/* Scatter work[] into global y vector. */
lwork = 0;
for (k = p; k < nsupers; k += num_diag_procs) {
knsupc = SuperSize( k );
ii = FstBlockC( k );
y_col = &y[ii];
for (j = 0; j < nrhs; ++j) {
for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork];
lwork += knsupc;
y_col += ldy;
}
}
}
} /* GATHER_DIAG_TO_ALL */
/trunk/OTHER/SuperLU_DIST_2.5/SRC/temp
New file
0,0 → 1,296
1,4c1,2
< /*! @file
< * \brief Implements parallel symbolic factorization
< *
< * <pre>
---
>
> /*
12c10,13
< *
---
> */
>
>
> /*
19c20
< * </pre>
---
> *
22,23d22
<
<
130,132c129,147
< /*! \brief
< *
< * <pre>
---
> float symbfact_dist
> /************************************************************************/
> (
> int nprocs_num, /* Input - no of processors */
> int nprocs_symb, /* Input - no of processors for the symbolic
> factorization */
> SuperMatrix *A, /* Input - distributed input matrix */
> int_t *perm_c, /* Input - column permutation */
> int_t *perm_r, /* Input - row permutation */
> int_t *sizes, /* Input - sizes of each node in the separator tree */
> int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
> Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure,
> global to local indexing information */
> MPI_Comm *num_comm, /* Input - communicator for numerical factorization */
> MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */
> mem_usage_t *symb_mem_usage
> )
> {
> /*
214c229
< * </pre>
---
> *
216,234d230
< float symbfact_dist
< /************************************************************************/
< (
< int nprocs_num, /* Input - no of processors */
< int nprocs_symb, /* Input - no of processors for the symbolic
< factorization */
< SuperMatrix *A, /* Input - distributed input matrix */
< int_t *perm_c, /* Input - column permutation */
< int_t *perm_r, /* Input - row permutation */
< int_t *sizes, /* Input - sizes of each node in the separator tree */
< int_t *fstVtxSep, /* Input - first vertex of each node in the tree */
< Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure,
< global to local indexing information */
< MPI_Comm *num_comm, /* Input - communicator for numerical factorization */
< MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */
< mem_usage_t *symb_mem_usage
< )
< {
<
756,762c752
< /*! \brief
< * <pre>
< * Purpose
< * =======
< * Initialize relaxation parameters and statistics variables
< * </pre>
< */
---
>
768c758,762
<
---
> /*
> * Purpose
> * =======
> * Initialize relaxation parameters and statistics variables
> */
804,814d797
< /*! \brief
< *
< * <pre>
< * Purpose
< * =======
< *
< * Computes an estimation of the number of elements in columns of L
< * and rows of U. Stores this information in cntelt_vtcs, and it will
< * be used in the right-looking symbolic factorization.
< * </pre>
< */
830c813,820
<
---
> /*
> * Purpose
> * =======
> *
> * Computes an estimation of the number of elements in columns of L
> * and rows of U. Stores this information in cntelt_vtcs, and it will
> * be used in the right-looking symbolic factorization.
> */
960,962c950,967
< /*! \brief
< *
< * <pre>
---
> static float
> symbfact_mapVtcs
> (
> int iam, /* Input -process number */
> int nprocs_num, /* Input -number of processors */
> int nprocs_symb, /* Input -number of procs for symbolic factorization */
> SuperMatrix *A, /* Input -input distributed matrix A */
> int_t *fstVtxSep, /* Input -first vertex in each separator */
> int_t *sizes, /* Input -size of each separator in the separator tree */
> Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc
> computed */
> vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */
> int_t *tempArray, /* Input -temp array of size n = order of the matrix */
> int_t maxSzBlk, /* Input -maximum number of vertices in a block */
> psymbfact_stat_t *PS /* Input/Output -statistics */
> )
> {
> /*
985c990
< * </pre>
---
> *
987,1004d991
< static float
< symbfact_mapVtcs
< (
< int iam, /* Input -process number */
< int nprocs_num, /* Input -number of processors */
< int nprocs_symb, /* Input -number of procs for symbolic factorization */
< SuperMatrix *A, /* Input -input distributed matrix A */
< int_t *fstVtxSep, /* Input -first vertex in each separator */
< int_t *sizes, /* Input -size of each separator in the separator tree */
< Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc
< computed */
< vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */
< int_t *tempArray, /* Input -temp array of size n = order of the matrix */
< int_t maxSzBlk, /* Input -maximum number of vertices in a block */
< psymbfact_stat_t *PS /* Input/Output -statistics */
< )
< {
<
1199,1210d1185
< /*! \brief
< *
< * <pre>
< * Purpose
< * =======
< *
< * Distribute input matrix A for the symbolic factorization routine.
< * Only structural information is distributed. The redistributed
< * matrix has its rows and columns permuted according to perm_r and
< * perm_c. A is not modified during this routine.
< * </pre>
< */
1231c1206,1215
<
---
> /*
> * Purpose
> * =======
> *
> * Distribute input matrix A for the symbolic factorization routine.
> * Only structural information is distributed. The redistributed
> * matrix has its rows and columns permuted according to perm_r and
> * perm_c. A is not modified during this routine.
> *
> */
1466a1451,1452
> intBuf1[iam]=0; /* This corresponds to nnzToSend[iam] */
> intBuf3[iam]=0; /* This corresponds to nnzToRecv[iam] */
1469a1456,1458
> i = nnzToRecv[iam];
> nnzToRecv[iam] = 0;
> nnzToSend[iam] = 0;
1472,1475d1460
< i = nnzToRecv[iam]; /* This corresponds to nnzToRecv[iam] */
< intBuf3[iam] = 0;
< intBuf1[iam] = 0;
<
1478a1464
>
1481c1467
< #endif
---
> #else /* Default */
1482a1469
> #endif
1603,1613d1589
< /*! \brief
< *
< * <pre>
< * Allocate storage for data structures necessary for pruned graphs.
< * For those unpredictable size, make a guess as FILL * n.
< * Return value:
< * 0 if enough memory was available;
< * otherwise, return the amount of space intended to allocate
< * when memory allocation failure occurred.
< * </pre>
< */
1623c1599,1606
<
---
> /*
> * Allocate storage for data structures necessary for pruned graphs.
> * For those unpredictable size, make a guess as FILL * n.
> * Return value:
> * 0 if enough memory was available;
> * otherwise, return the amount of space intended to allocate
> * when memory allocation failure occurred.
> */
1706,1716d1688
< /*! \brief
< *
< * <pre>
< * Allocate storage for data structures necessary for pruned graphs.
< * For those unpredictable size, make a guess as FILL * n.
< * Return value:
< * 0 if enough memory was available;
< * otherwise, return the amount of space intended to allocate
< * when memory allocation failure occurred.
< * </pre>
< */
1728c1700,1707
<
---
> /*
> * Allocate storage for data structures necessary for pruned graphs.
> * For those unpredictable size, make a guess as FILL * n.
> * Return value:
> * 0 if enough memory was available;
> * otherwise, return the amount of space intended to allocate
> * when memory allocation failure occurred.
> */
1796,1806d1774
< /*! \brief
< *
< * <pre>
< * Allocate storage for the data structures common to symbolic factorization
< * routines. For those unpredictable size, make a guess as FILL * nnz(A).
< * Return value:
< * 0 if enough memory was available;
< * otherwise, return the amount of space intended to allocate
< * when memory allocation failure occurred.
< * </pre>
< */
1821c1789,1796
<
---
> /*
> * Allocate storage for the data structures common to symbolic factorization
> * routines. For those unpredictable size, make a guess as FILL * nnz(A).
> * Return value:
> * 0 if enough memory was available;
> * otherwise, return the amount of space intended to allocate
> * when memory allocation failure occurred.
> */
1990d1964
<
2846,2848c2820
< /*! \brief
< *
< * <pre>
---
> /*
2854d2825
< * </pre>
4013,4016c3984
< /*! \brief
<
< <pre>
< all processors affected to current node must call this routine
---
> /* all processors affected to current node must call this routine
4019,4021c3987,3988
< MPI_allreduce among all processors affected to curent node
< </pre>
< */
---
> MPI_allreduce among all processors affected to curent node */
>
/trunk/OTHER/SuperLU_DIST_2.5/SRC/memory.patch
New file
0,0 → 1,10
118d117
<
144c143
< buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
---
> buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t));
152c151
< buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t));
---
> buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t));
/trunk/OTHER/SuperLU_DIST_2.5/SRC/dgsequ.c
New file
0,0 → 1,184
/*! @file
* \brief Computes row and column scalings
*/
 
/*
* File name: dgsequ.c
* History: Modified from LAPACK routine DGEEQU
*/
#include <math.h>
#include "superlu_ddefs.h"
 
/*! \brief
 
<pre>
Purpose
=======
 
DGSEQU_dist computes row and column scalings intended to equilibrate an
M-by-N sparse matrix A and reduce its condition number. R returns the row
scale factors and C the column scale factors, chosen to try to make
the largest element in each row and column of the matrix B with
elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
 
R(i) and C(j) are restricted to be between SMLNUM = smallest safe
number and BIGNUM = largest safe number. Use of these scaling
factors is not guaranteed to reduce the condition number of A but
works well in practice.
 
See supermatrix.h for the definition of 'SuperMatrix' structure.
Arguments
=========
 
A (input) SuperMatrix*
The matrix of dimension (A->nrow, A->ncol) whose equilibration
factors are to be computed. The type of A can be:
Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
R (output) double*, size A->nrow
If INFO = 0 or INFO > M, R contains the row scale factors
for A.
C (output) double*, size A->ncol
If INFO = 0, C contains the column scale factors for A.
ROWCND (output) double*
If INFO = 0 or INFO > M, ROWCND contains the ratio of the
smallest R(i) to the largest R(i). If ROWCND >= 0.1 and
AMAX is neither too large nor too small, it is not worth
scaling by R.
COLCND (output) double*
If INFO = 0, COLCND contains the ratio of the smallest
C(i) to the largest C(i). If COLCND >= 0.1, it is not
worth scaling by C.
AMAX (output) double*
Absolute value of largest matrix element. If AMAX is very
close to overflow or very close to underflow, the matrix
should be scaled.
INFO (output) int*
= 0: successful exit
< 0: if INFO = -i, the i-th argument had an illegal value
> 0: if INFO = i, and i is
<= M: the i-th row of A is exactly zero
> M: the (i-M)-th column of A is exactly zero
 
=====================================================================
</pre>
*/
 
void
dgsequ_dist(SuperMatrix *A, double *r, double *c, double *rowcnd,
double *colcnd, double *amax, int_t *info)
{
 
/* Local variables */
NCformat *Astore;
double *Aval;
int i, j, irow;
double rcmin, rcmax;
double bignum, smlnum;
extern double dlamch_(char *);
/* Test the input parameters. */
*info = 0;
if ( A->nrow < 0 || A->ncol < 0 ||
A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE )
*info = -1;
if (*info != 0) {
i = -(*info);
xerbla_("dgsequ_dist", &i);
return;
}
 
/* Quick return if possible */
if ( A->nrow == 0 || A->ncol == 0 ) {
*rowcnd = 1.;
*colcnd = 1.;
*amax = 0.;
return;
}
 
Astore = (NCformat *) A->Store;
Aval = (double *) Astore->nzval;
/* Get machine constants. */
smlnum = dlamch_("S");
bignum = 1. / smlnum;
 
/* Compute row scale factors. */
for (i = 0; i < A->nrow; ++i) r[i] = 0.;
 
/* Find the maximum element in each row. */
for (j = 0; j < A->ncol; ++j)
for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
irow = Astore->rowind[i];
r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) );
}
 
/* Find the maximum and minimum scale factors. */
rcmin = bignum;
rcmax = 0.;
for (i = 0; i < A->nrow; ++i) {
rcmax = SUPERLU_MAX(rcmax, r[i]);
rcmin = SUPERLU_MIN(rcmin, r[i]);
}
*amax = rcmax;
 
if (rcmin == 0.) {
/* Find the first zero scale factor and return an error code. */
for (i = 0; i < A->nrow; ++i)
if (r[i] == 0.) {
*info = i + 1;
return;
}
} else {
/* Invert the scale factors. */
for (i = 0; i < A->nrow; ++i)
r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum );
/* Compute ROWCND = min(R(I)) / max(R(I)) */
*rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
}
 
/* Compute column scale factors */
for (j = 0; j < A->ncol; ++j) c[j] = 0.;
 
/* Find the maximum element in each column, assuming the row
scalings computed above. */
for (j = 0; j < A->ncol; ++j)
for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
irow = Astore->rowind[i];
c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] );
}
 
/* Find the maximum and minimum scale factors. */
rcmin = bignum;
rcmax = 0.;
for (j = 0; j < A->ncol; ++j) {
rcmax = SUPERLU_MAX(rcmax, c[j]);
rcmin = SUPERLU_MIN(rcmin, c[j]);
}
 
if (rcmin == 0.) {
/* Find the first zero scale factor and return an error code. */
for (j = 0; j < A->ncol; ++j)
if ( c[j] == 0. ) {
*info = A->nrow + j + 1;
return;
}
} else {
/* Invert the scale factors. */
for (j = 0; j < A->ncol; ++j)
c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum);
/* Compute COLCND = min(C(J)) / max(C(J)) */
*colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum );
}
 
return;
 
} /* dgsequ_dist */
 
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/supermatrix.h
New file
0,0 → 1,181
/*! @file
* \brief Matrix type definitions
*/
 
#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */
#define __SUPERLU_SUPERMATRIX
 
 
/********************************************
* The matrix types are defined as follows. *
********************************************/
typedef enum {
SLU_NC, /* column-wise, no supernode */
SLU_NCP, /* column-wise, column-permuted, no supernode
(The consecutive columns of nonzeros, after permutation,
may not be stored contiguously.) */
SLU_NR, /* row-wize, no supernode */
SLU_SC, /* column-wise, supernode */
SLU_SCP, /* supernode, column-wise, permuted */
SLU_SR, /* row-wise, supernode */
SLU_DN, /* Fortran style column-wise storage for dense matrix */
SLU_NR_loc /* distributed compressed row format */
} Stype_t;
 
typedef enum {
SLU_S, /* single */
SLU_D, /* double */
SLU_C, /* single complex */
SLU_Z /* double complex */
} Dtype_t;
 
typedef enum {
SLU_GE, /* general */
SLU_TRLU, /* lower triangular, unit diagonal */
SLU_TRUU, /* upper triangular, unit diagonal */
SLU_TRL, /* lower triangular */
SLU_TRU, /* upper triangular */
SLU_SYL, /* symmetric, store lower half */
SLU_SYU, /* symmetric, store upper half */
SLU_HEL, /* Hermitian, store lower half */
SLU_HEU /* Hermitian, store upper half */
} Mtype_t;
 
typedef struct {
Stype_t Stype; /* Storage type: interprets the storage structure
pointed to by *Store. */
Dtype_t Dtype; /* Data type. */
Mtype_t Mtype; /* Matrix type: describes the mathematical property of
the matrix. */
int_t nrow; /* number of rows */
int_t ncol; /* number of columns */
void *Store; /* pointer to the actual storage of the matrix */
} SuperMatrix;
 
/***********************************************
* The storage schemes are defined as follows. *
***********************************************/
 
/* Stype == SLU_NC (Also known as Harwell-Boeing sparse matrix format) */
typedef struct {
int_t nnz; /* number of nonzeros in the matrix */
void *nzval; /* pointer to array of nonzero values, packed by column */
int_t *rowind; /* pointer to array of row indices of the nonzeros */
int_t *colptr; /* pointer to array of beginning of columns in nzval[]
and rowind[] */
/* Note:
Zero-based indexing is used;
colptr[] has ncol+1 entries, the last one pointing
beyond the last column, so that colptr[ncol] = nnz. */
} NCformat;
 
/* Stype == SLU_NR */
typedef struct {
int_t nnz; /* number of nonzeros in the matrix */
void *nzval; /* pointer to array of nonzero values, packed by raw */
int_t *colind; /* pointer to array of columns indices of the nonzeros */
int_t *rowptr; /* pointer to array of beginning of rows in nzval[]
and colind[] */
/* Note:
Zero-based indexing is used;
rowptr[] has nrow+1 entries, the last one pointing
beyond the last row, so that rowptr[nrow] = nnz. */
} NRformat;
 
/* Stype == SLU_SC */
typedef struct {
int_t nnz; /* number of nonzeros in the matrix */
int_t nsuper; /* number of supernodes, minus 1 */
void *nzval; /* pointer to array of nonzero values, packed by column */
int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */
int_t *rowind; /* pointer to array of compressed row indices of
rectangular supernodes */
int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */
int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column
j belongs; mapping from column to supernode number. */
int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th
supernode; mapping from supernode number to column.
e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
sup_to_col: 0 1 2 4 7 12 (nsuper=4) */
/* Note:
Zero-based indexing is used;
nzval_colptr[], rowind_colptr[], col_to_sup and
sup_to_col[] have ncol+1 entries, the last one
pointing beyond the last column.
For col_to_sup[], only the first ncol entries are
defined. For sup_to_col[], only the first nsuper+2
entries are defined. */
} SCformat;
 
/* Stype == SLU_SCP */
typedef struct {
int_t nnz; /* number of nonzeros in the matrix */
int_t nsuper; /* number of supernodes */
void *nzval; /* pointer to array of nonzero values, packed by column */
int_t *nzval_colbeg;/* nzval_colbeg[j] points to beginning of column j
in nzval[] */
int_t *nzval_colend;/* nzval_colend[j] points to one past the last element
of column j in nzval[] */
int_t *rowind; /* pointer to array of compressed row indices of
rectangular supernodes */
int_t *rowind_colbeg;/* rowind_colbeg[j] points to beginning of column j
in rowind[] */
int_t *rowind_colend;/* rowind_colend[j] points to one past the last element
of column j in rowind[] */
int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column
j belongs; mapping from column to supernode. */
int_t *sup_to_colbeg; /* sup_to_colbeg[s] points to the start of the s-th
supernode; mapping from supernode to column.*/
int_t *sup_to_colend; /* sup_to_colend[s] points to one past the end of the
s-th supernode; mapping from supernode number to
column.
e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
sup_to_colbeg: 0 1 2 4 7 (nsuper=4)
sup_to_colend: 1 2 4 7 12 */
/* Note:
Zero-based indexing is used;
nzval_colptr[], rowind_colptr[], col_to_sup and
sup_to_col[] have ncol+1 entries, the last one
pointing beyond the last column. */
} SCPformat;
 
/* Stype == SLU_NCP */
typedef struct {
int_t nnz; /* number of nonzeros in the matrix */
void *nzval; /* pointer to array of nonzero values, packed by column */
int_t *rowind;/* pointer to array of row indices of the nonzeros */
/* Note: nzval[]/rowind[] always have the same length */
int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[]
and rowind[] */
int_t *colend;/* colend[j] points to one past the last element of column
j in nzval[] and rowind[] */
/* Note:
Zero-based indexing is used;
The consecutive columns of the nonzeros may not be
contiguous in storage, because the matrix has been
postmultiplied by a column permutation matrix. */
} NCPformat;
 
/* Stype == SLU_DN */
typedef struct {
int_t lda; /* leading dimension */
void *nzval; /* array of size lda*ncol to represent a dense matrix */
} DNformat;
 
/* Stype == SLU_NR_loc (Distributed Compressed Row Format) */
typedef struct {
int_t nnz_loc; /* number of nonzeros in the local submatrix */
int_t m_loc; /* number of rows local to this processor */
int_t fst_row; /* global index of the first row */
void *nzval; /* pointer to array of nonzero values, packed by row */
int_t *rowptr; /* pointer to array of beginning of rows in nzval[]
and colind[] */
int_t *colind; /* pointer to array of column indices of the nonzeros */
/* Note:
Zero-based indexing is used;
rowptr[] has n_loc + 1 entries, the last one pointing
beyond the last row, so that rowptr[n_loc] = nnz_loc.*/
} NRformat_loc;
 
 
#endif /* __SUPERLU_SUPERMATRIX */
/trunk/OTHER/SuperLU_DIST_2.5/SRC/GetDiagU.c
New file
0,0 → 1,107
/*! @file
* \brief Extracts the main diagonal of matrix U
*
* <pre>
* -- Auxiliary routine in distributed SuperLU (version 1.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* Xiaoye S. Li
* April 16, 2002
* </pre>
*/
 
#include "superlu_ddefs.h"
 
/*! \brief
*
* <pre>
* Purpose
* =======
*
* GetDiagU extracts the main diagonal of matrix U of the LU factorization.
*
* Arguments
* =========
*
* n (input) int
* Dimension of the matrix.
*
* LUstruct (input) LUstruct_t*
* The data structures to store the distributed L and U factors.
* see superlu_ddefs.h for its definition.
*
* grid (input) gridinfo_t*
* The 2D process mesh. It contains the MPI communicator, the number
* of process rows (NPROW), the number of process columns (NPCOL),
* and my process rank. It is an input argument to all the
* parallel routines.
*
* diagU (output) double*, dimension (n)
* The main diagonal of matrix U.
* On exit, it is available on all processes.
*
*
* Note
* ====
*
* The diagonal blocks of the L and U matrices are stored in the L
* data structures, and are on the diagonal processes of the
* 2D process grid.
*
* This routine is modified from gather_diag_to_all() in pdgstrs_Bglobal.c.
* </pre>
*/
void GetDiagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *diagU)
{
 
int_t *xsup;
int iam, knsupc, pkk;
int nsupr; /* number of rows in the block L(:,k) (LDA) */
int_t i, j, jj, k, lk, lwork, nsupers, p;
int_t num_diag_procs, *diag_procs, *diag_len;
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
LocalLU_t *Llu = LUstruct->Llu;
double *dblock, *dwork, *lusup;
 
iam = grid->iam;
nsupers = Glu_persist->supno[n-1] + 1;
xsup = Glu_persist->xsup;
 
get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
&diag_procs, &diag_len);
jj = diag_len[0];
for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
if ( !(dwork = doubleMalloc_dist(jj)) ) ABORT("Malloc fails for dwork[]");
 
for (p = 0; p < num_diag_procs; ++p) {
pkk = diag_procs[p];
if ( iam == pkk ) {
/* Copy diagonal into buffer dwork[]. */
lwork = 0;
for (k = p; k < nsupers; k += num_diag_procs) {
knsupc = SuperSize( k );
lk = LBj( k, grid );
nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
lusup = Llu->Lnzval_bc_ptr[lk];
for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */
dwork[lwork+i] = lusup[i*(nsupr+1)];
lwork += knsupc;
}
MPI_Bcast( dwork, lwork, MPI_DOUBLE, pkk, grid->comm );
} else {
MPI_Bcast( dwork, diag_len[p], MPI_DOUBLE, pkk, grid->comm );
}
 
/* Scatter dwork[] into global diagU vector. */
lwork = 0;
for (k = p; k < nsupers; k += num_diag_procs) {
knsupc = SuperSize( k );
dblock = &diagU[FstBlockC( k )];
for (i = 0; i < knsupc; ++i) dblock[i] = dwork[lwork+i];
lwork += knsupc;
}
} /* for p = ... */
 
SUPERLU_FREE(diag_procs);
SUPERLU_FREE(diag_len);
SUPERLU_FREE(dwork);
}
/trunk/OTHER/SuperLU_DIST_2.5/SRC/Makefile
New file
0,0 → 1,102
#######################################################################
#
# This makefile creates a library for distributed SuperLU.
# The files are organized as follows:
#
# ALLAUX -- Auxiliary routines called from all precisions
# DSLUSRC -- Double precision real serial SuperLU routines
# DPLUSRC -- Double precision real parallel SuperLU routines
# ZSLUSRC -- Double precision complex serial SuperLU routines
# ZPLUSRC -- Double precision complex parallel SuperLU routines
#
# The library can be set up to include routines for any combination
# of the two precisions. To create or add to the library, enter make
# followed by one or more of the precisions desired. Some examples:
# make double
# make double complex16
# Alternatively, the command
# make
# without any arguments creates a library of all two precisions.
# The library is called
# superlu.a
# and is created at the next higher directory level.
#
# To remove the object files after the library is created, enter
# make clean
#
#######################################################################
#include ../make.inc
 
include ../../../Makefile.def
DSUPERLULIB = $(DISTRIBUTED_SUPERLU_LIBRARY)
#DSUPERLULIB = a.lib
ARCH = $(AR)
ARCHFLAGS = $(ARFLAGS)
 
#
# Precision independent routines
#
ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \
mmd.o comm.o memory.o util.o superlu_grid.o \
pxerbla.o superlu_timer.o GetDiagU.o mc64ad.o symbfact.o \
psymbfact.o psymbfact_util.o get_perm_c_parmetis.o
 
#### LAPACK auxiliary routines
LAAUX = lsame.o xerbla.o slamch.o dlamch.o
 
#
# Routines literally taken from SuperLU
#
DSLUSRC = dlangs.o dgsequ.o dlaqgs.o dutil.o \
dmemory.o dmyblas2.o dsp_blas2.o dsp_blas3.o
ZSLUSRC = dcomplex.o zlangs.o zgsequ.o zlaqgs.o zutil.o \
zmemory.o zmyblas2.o dmemory.o zsp_blas2.o zsp_blas3.o
 
#
# Routines for double precision parallel SuperLU
# DPLUSRC = dldperm.o ddistribute.o pdgstrf.o pdgstrs_Bglobal.o
DPLUSRC = dldperm.o ddistribute.o pdgstrf.o pdgstrs_Bglobal.o \
pdgstrs1.o pdgssvx_ABglobal.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o\
pdgssvx.o pdgstrs.o pddistribute.o pdlangs.o pdutil.o \
pdgsequ.o pdlaqgs.o pdgsrfs.o pdgsmv.o pdgstrs_lsum.o \
pdsymbfact_distdata.o
 
#
# Routines for double complex parallel SuperLU
# ZPLUSRC = zldperm.o zdistribute.o pzgssvx_ABglobal.o pzgstrf_irecv.o
ZPLUSRC = zldperm.o zdistribute.o pzgssvx_ABglobal.o pzgstrf.o \
pzgstrs1.o pzgstrs_Bglobal.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \
pzgssvx.o pzgstrs.o pzdistribute.o pzlangs.o pzutil.o \
pzgsequ.o pzlaqgs.o pzgsrfs.o pzgsmv.o pzgstrs_lsum.o \
pzsymbfact_distdata.o
 
all: double
 
double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX)
$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
$(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX)
$(RANLIB) $(DSUPERLULIB)
 
complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX)
$(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \
$(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX)
$(RANLIB) $(DSUPERLULIB)
 
 
##################################
# Do not optimize these routines #
##################################
slamch.o: slamch.c ; $(CC) $(NOOPTS) $(CDEFS) -c $<
dlamch.o: dlamch.c ; $(CC) $(NOOPTS) $(CDEFS) -c $<
##################################
 
.c.o:
$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE)
 
.f.o:
$(FORTRAN) $(FFLAGS) -c $< $(VERBOSE)
 
clean:
rm -f *.o $(DSUPERLULIB)
 
 
/trunk/OTHER/SuperLU_DIST_2.5/SRC/machines.h
New file
0,0 → 1,53
/*! @file
* \brief These macros define which machine will be used
*
* <pre>
* -- SuperLU MT routine (version 1.0) --
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
* and Lawrence Berkeley National Lab.
* August 15, 1997
*
* These macros define which machine will be used.
* </pre>
*/
 
#ifndef __SUPERLU_MACHINES /* allow multiple inclusions */
#define __SUPERLU_MACHINES
 
#define SGI 0
#define ORIGIN 1
#define DEC 2
#define CRAY_T3E 3
#define SUN 4
#define PTHREAD 5
#define IBM 6
 
#ifdef _SGI
#define MACH SGI
#endif
 
#ifdef _ORIGIN
#define MACH ORIGIN
#endif
 
#ifdef _DEC
#define MACH DEC
#endif
 
#ifdef _CRAY
#define MACH CRAY_T3E
#endif
 
#ifdef _SOLARIS
#define MACH SUN
#endif
 
#ifdef _PTHREAD
#define MACH PTHREAD
#endif
 
#if ( defined(_SP2) || defined(_SP) )
#define MACH IBM
#endif
 
#endif /* __SUPERLU_MACHINES */
/trunk/OTHER/SuperLU_DIST_2.5/SRC/pdgsmv.c
New file
0,0 → 1,373
 
 
/*! @file
* \brief
*
* <pre>
* -- Distributed SuperLU routine (version 2.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* March 15, 2003
* </pre>
*/
 
#include <math.h>
#include "superlu_ddefs.h"
 
void pdgsmv_init
(
SuperMatrix *A, /* Matrix A permuted by columns (input/output).
The type of A can be:
Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. */
int_t *row_to_proc, /* Input. Mapping between rows and processes. */
gridinfo_t *grid, /* Input */
pdgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */
)
{
NRformat_loc *Astore;
int iam, p, procs;
int *SendCounts, *RecvCounts;
int_t i, j, k, l, m, m_loc, n, fst_row, jcol;
int_t TotalIndSend, TotalValSend;
int_t *colind, *rowptr;
int_t *ind_tosend = NULL, *ind_torecv = NULL;
int_t *ptr_ind_tosend, *ptr_ind_torecv;
int_t *extern_start, *spa, *itemp;
double *nzval, *val_tosend = NULL, *val_torecv = NULL, t;
MPI_Request *send_req, *recv_req;
MPI_Status status;
 
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(grid->iam, "Enter pdgsmv_init()");
#endif
 
/* ------------------------------------------------------------
INITIALIZATION.
------------------------------------------------------------*/
iam = grid->iam;
procs = grid->nprow * grid->npcol;
Astore = (NRformat_loc *) A->Store;
m = A->nrow;
n = A->ncol;
m_loc = Astore->m_loc;
fst_row = Astore->fst_row;
colind = Astore->colind;
rowptr = Astore->rowptr;
nzval = Astore->nzval;
if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) )
ABORT("Malloc fails for SendCounts[]");
/*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/
RecvCounts = SendCounts + procs;
if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) )
ABORT("Malloc fails for ptr_ind_tosend[]");
ptr_ind_torecv = ptr_ind_tosend + procs + 1;
if ( !(extern_start = intMalloc_dist(m_loc)) )
ABORT("Malloc fails for extern_start[]");
for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i];
 
/* ------------------------------------------------------------
COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS.
THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS.
SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE
LOCAL PART OF X.
THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A.
------------------------------------------------------------*/
if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */
ABORT("Malloc fails for spa[]");
for (p = 0; p < procs; ++p) SendCounts[p] = 0;
for (i = 0; i < m_loc; ++i) { /* Loop through each row */
k = extern_start[i];
for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */
jcol = colind[j];
p = row_to_proc[jcol];
if ( p != iam ) { /* External */
if ( spa[jcol] == 0 ) { /* First time see this index */
++SendCounts[p];
spa[jcol] = 1;
}
} else { /* Swap to beginning the part of A corresponding
to the local part of X */
l = colind[k];
t = nzval[k];
colind[k] = jcol;
nzval[k] = nzval[j];
colind[j] = l;
nzval[j] = t;
++k;
}
}
extern_start[i] = k;
}
 
/* ------------------------------------------------------------
LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES.
THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A.
------------------------------------------------------------*/
/* Build pointers to ind_tosend[]. */
ptr_ind_tosend[0] = 0;
for (p = 0, TotalIndSend = 0; p < procs; ++p) {
TotalIndSend += SendCounts[p]; /* Total to send. */
ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p];
}
#if 0
ptr_ind_tosend[iam] = 0; /* Local part of X */
#endif
if ( TotalIndSend ) {
if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) )
ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */
}
 
/* Build SPA to aid global to local translation. */
for (i = 0; i < n; ++i) spa[i] = EMPTY;
for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */
for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
jcol = colind[j];
if ( spa[jcol] == EMPTY ) { /* First time see this index */
p = row_to_proc[jcol];
if ( p == iam ) { /* Local */
/*assert(jcol>=fst_row);*/
spa[jcol] = jcol - fst_row; /* Relative position in local X */
} else { /* External */
ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */
spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */
++ptr_ind_tosend[p];
}
}
}
}
/* ------------------------------------------------------------
TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES.
THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A.
------------------------------------------------------------*/
for (i = 0; i < m_loc; ++i) {
for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
jcol = colind[j];
colind[j] = spa[jcol];
}
}
 
/* ------------------------------------------------------------
COMMUNICATE THE EXTERNAL INDICES OF X.
------------------------------------------------------------*/
MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT,
grid->comm);
 
/* Build pointers to ind_torecv[]. */
ptr_ind_torecv[0] = 0;
for (p = 0, TotalValSend = 0; p < procs; ++p) {
TotalValSend += RecvCounts[p]; /* Total to receive. */
ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p];
}
if ( TotalValSend ) {
if ( !(ind_torecv = intMalloc_dist(TotalValSend)) )
ABORT("Malloc fails for ind_torecv[]");
}
 
if ( !(send_req = (MPI_Request *)
SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))))
ABORT("Malloc fails for recv_req[].");
recv_req = send_req + procs;
for (p = 0; p < procs; ++p) {
ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */
if ( SendCounts[p] ) {
MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p],
mpi_int_t, p, iam, grid->comm, &send_req[p]);
}
if ( RecvCounts[p] ) {
MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p],
mpi_int_t, p, p, grid->comm, &recv_req[p]);
}
}
for (p = 0; p < procs; ++p) {
if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status);