| /trunk/OTHER/SuperLU_DIST_2.5/SRC/pddistribute.c |
|---|
| New file |
| 0,0 → 1,1058 |
| /*! @file |
| * \brief Re-distribute A on the 2D process mesh. |
| * <pre> |
| * -- Distributed SuperLU routine (version 2.3) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * October 15, 2008 |
| * </pre> |
| */ |
| #include "superlu_ddefs.h" |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * Re-distribute A on the 2D process mesh. |
| * |
| * Arguments |
| * ========= |
| * |
| * A (input) SuperMatrix* |
| * The distributed input matrix A of dimension (A->nrow, A->ncol). |
| * A may be overwritten by diag(R)*A*diag(C)*Pc^T. |
| * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. |
| * |
| * ScalePermstruct (input) ScalePermstruct_t* |
| * The data structure to store the scaling and permutation vectors |
| * describing the transformations performed to the original matrix A. |
| * |
| * Glu_freeable (input) *Glu_freeable_t |
| * The global structure describing the graph of L and U. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. |
| * |
| * colptr (output) int* |
| * |
| * rowind (output) int* |
| * |
| * a (output) double* |
| * |
| * Return value |
| * ============ |
| * </pre> |
| */ |
| int_t |
| dReDistribute_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, |
| Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno, |
| gridinfo_t *grid, int_t *colptr[], int_t *rowind[], |
| double *a[]) |
| { |
| NRformat_loc *Astore; |
| int_t *perm_r; /* row permutation vector */ |
| int_t *perm_c; /* column permutation vector */ |
| int_t i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize; |
| int_t nnz_loc; /* number of local nonzeros */ |
| int_t nnz_remote; /* number of remote nonzeros to be sent */ |
| int_t SendCnt; /* number of remote nonzeros to be sent */ |
| int_t RecvCnt; /* number of remote nonzeros to be sent */ |
| int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; |
| int_t *ia, *ja, **ia_send, *index, *itemp; |
| int_t *ptr_to_send; |
| double *aij, **aij_send, *nzval, *dtemp; |
| double *nzval_a; |
| int iam, it, p, procs; |
| MPI_Request *send_req; |
| MPI_Status status; |
| /* ------------------------------------------------------------ |
| INITIALIZATION. |
| ------------------------------------------------------------*/ |
| iam = grid->iam; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter dReDistribute_A()"); |
| #endif |
| perm_r = ScalePermstruct->perm_r; |
| perm_c = ScalePermstruct->perm_c; |
| procs = grid->nprow * grid->npcol; |
| Astore = (NRformat_loc *) A->Store; |
| n = A->ncol; |
| m_loc = Astore->m_loc; |
| fst_row = Astore->fst_row; |
| nnzToRecv = intCalloc_dist(2*procs); |
| nnzToSend = nnzToRecv + procs; |
| /* ------------------------------------------------------------ |
| COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, |
| THEN ALLOCATE SPACE. |
| THIS ACCOUNTS FOR THE FIRST PASS OF A. |
| ------------------------------------------------------------*/ |
| for (i = 0; i < m_loc; ++i) { |
| for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { |
| irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ |
| jcol = Astore->colind[j]; |
| gbi = BlockNum( irow ); |
| gbj = BlockNum( jcol ); |
| p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); |
| ++nnzToSend[p]; |
| } |
| } |
| /* All-to-all communication */ |
| MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, |
| grid->comm); |
| maxnnzToRecv = 0; |
| nnz_loc = SendCnt = RecvCnt = 0; |
| for (p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| SendCnt += nnzToSend[p]; |
| RecvCnt += nnzToRecv[p]; |
| maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); |
| } else { |
| nnz_loc += nnzToRecv[p]; |
| /*assert(nnzToSend[p] == nnzToRecv[p]);*/ |
| } |
| } |
| k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ |
| /* Allocate space for storing the triplets after redistribution. */ |
| if ( k ) { /* count can be zero. */ |
| if ( !(ia = intMalloc_dist(2*k)) ) |
| ABORT("Malloc fails for ia[]."); |
| if ( !(aij = doubleMalloc_dist(k)) ) |
| ABORT("Malloc fails for aij[]."); |
| } |
| ja = ia + k; |
| /* Allocate temporary storage for sending/receiving the A triplets. */ |
| if ( procs > 1 ) { |
| if ( !(send_req = (MPI_Request *) |
| SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) |
| ABORT("Malloc fails for send_req[]."); |
| if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) |
| ABORT("Malloc fails for ia_send[]."); |
| if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) |
| ABORT("Malloc fails for aij_send[]."); |
| if ( SendCnt ) { /* count can be zero */ |
| if ( !(index = intMalloc_dist(2*SendCnt)) ) |
| ABORT("Malloc fails for index[]."); |
| if ( !(nzval = doubleMalloc_dist(SendCnt)) ) |
| ABORT("Malloc fails for nzval[]."); |
| } |
| if ( !(ptr_to_send = intCalloc_dist(procs)) ) |
| ABORT("Malloc fails for ptr_to_send[]."); |
| if ( maxnnzToRecv ) { /* count can be zero */ |
| if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) |
| ABORT("Malloc fails for itemp[]."); |
| if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) |
| ABORT("Malloc fails for dtemp[]."); |
| } |
| for (i = 0, j = 0, p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| ia_send[p] = &index[i]; |
| i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ |
| aij_send[p] = &nzval[j]; |
| j += nnzToSend[p]; |
| } |
| } |
| } /* if procs > 1 */ |
| if ( !(*colptr = intCalloc_dist(n+1)) ) |
| ABORT("Malloc fails for *colptr[]."); |
| /* ------------------------------------------------------------ |
| LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. |
| THIS ACCOUNTS FOR THE SECOND PASS OF A. |
| ------------------------------------------------------------*/ |
| nnz_loc = 0; /* Reset the local nonzero count. */ |
| nzval_a = Astore->nzval; |
| for (i = 0; i < m_loc; ++i) { |
| for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { |
| irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ |
| jcol = Astore->colind[j]; |
| gbi = BlockNum( irow ); |
| gbj = BlockNum( jcol ); |
| p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); |
| if ( p != iam ) { /* remote */ |
| k = ptr_to_send[p]; |
| ia_send[p][k] = irow; |
| ia_send[p][k + nnzToSend[p]] = jcol; |
| aij_send[p][k] = nzval_a[j]; |
| ++ptr_to_send[p]; |
| } else { /* local */ |
| ia[nnz_loc] = irow; |
| ja[nnz_loc] = jcol; |
| aij[nnz_loc] = nzval_a[j]; |
| ++nnz_loc; |
| ++(*colptr)[jcol]; /* Count nonzeros in each column */ |
| } |
| } |
| } |
| /* ------------------------------------------------------------ |
| PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. |
| NOTE: Can possibly use MPI_Alltoallv. |
| ------------------------------------------------------------*/ |
| for (p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| it = 2*nnzToSend[p]; |
| MPI_Isend( ia_send[p], it, mpi_int_t, |
| p, iam, grid->comm, &send_req[p] ); |
| it = nnzToSend[p]; |
| MPI_Isend( aij_send[p], it, MPI_DOUBLE, |
| p, iam+procs, grid->comm, &send_req[procs+p] ); |
| } |
| } |
| for (p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| it = 2*nnzToRecv[p]; |
| MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); |
| it = nnzToRecv[p]; |
| MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, |
| grid->comm, &status ); |
| for (i = 0; i < nnzToRecv[p]; ++i) { |
| ia[nnz_loc] = itemp[i]; |
| jcol = itemp[i + nnzToRecv[p]]; |
| /*assert(jcol<n);*/ |
| ja[nnz_loc] = jcol; |
| aij[nnz_loc] = dtemp[i]; |
| ++nnz_loc; |
| ++(*colptr)[jcol]; /* Count nonzeros in each column */ |
| } |
| } |
| } |
| for (p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| MPI_Wait( &send_req[p], &status); |
| MPI_Wait( &send_req[procs+p], &status); |
| } |
| } |
| /* ------------------------------------------------------------ |
| DEALLOCATE TEMPORARY STORAGE |
| ------------------------------------------------------------*/ |
| SUPERLU_FREE(nnzToRecv); |
| if ( procs > 1 ) { |
| SUPERLU_FREE(send_req); |
| SUPERLU_FREE(ia_send); |
| SUPERLU_FREE(aij_send); |
| if ( SendCnt ) { |
| SUPERLU_FREE(index); |
| SUPERLU_FREE(nzval); |
| } |
| SUPERLU_FREE(ptr_to_send); |
| if ( maxnnzToRecv ) { |
| SUPERLU_FREE(itemp); |
| SUPERLU_FREE(dtemp); |
| } |
| } |
| /* ------------------------------------------------------------ |
| CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT. |
| ------------------------------------------------------------*/ |
| if ( nnz_loc ) { /* nnz_loc can be zero */ |
| if ( !(*rowind = intMalloc_dist(nnz_loc)) ) |
| ABORT("Malloc fails for *rowind[]."); |
| if ( !(*a = doubleMalloc_dist(nnz_loc)) ) |
| ABORT("Malloc fails for *a[]."); |
| } |
| /* Initialize the array of column pointers */ |
| k = 0; |
| jsize = (*colptr)[0]; |
| (*colptr)[0] = 0; |
| for (j = 1; j < n; ++j) { |
| k += jsize; |
| jsize = (*colptr)[j]; |
| (*colptr)[j] = k; |
| } |
| /* Copy the triplets into the column oriented storage */ |
| for (i = 0; i < nnz_loc; ++i) { |
| j = ja[i]; |
| k = (*colptr)[j]; |
| (*rowind)[k] = ia[i]; |
| (*a)[k] = aij[i]; |
| ++(*colptr)[j]; |
| } |
| /* Reset the column pointers to the beginning of each column */ |
| for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1]; |
| (*colptr)[0] = 0; |
| if ( nnz_loc ) { |
| SUPERLU_FREE(ia); |
| SUPERLU_FREE(aij); |
| } |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit dReDistribute_A()"); |
| #endif |
| } /* dReDistribute_A */ |
| float |
| pddistribute(fact_t fact, int_t n, SuperMatrix *A, |
| ScalePermstruct_t *ScalePermstruct, |
| Glu_freeable_t *Glu_freeable, LUstruct_t *LUstruct, |
| gridinfo_t *grid) |
| /* |
| * -- Distributed SuperLU routine (version 2.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * March 15, 2003 |
| * |
| * |
| * Purpose |
| * ======= |
| * Distribute the matrix onto the 2D process mesh. |
| * |
| * Arguments |
| * ========= |
| * |
| * fact (input) fact_t |
| * Specifies whether or not the L and U structures will be re-used. |
| * = SamePattern_SameRowPerm: L and U structures are input, and |
| * unchanged on exit. |
| * = DOFACT or SamePattern: L and U structures are computed and output. |
| * |
| * n (input) int |
| * Dimension of the matrix. |
| * |
| * A (input) SuperMatrix* |
| * The distributed input matrix A of dimension (A->nrow, A->ncol). |
| * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: |
| * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. |
| * |
| * ScalePermstruct (input) ScalePermstruct_t* |
| * The data structure to store the scaling and permutation vectors |
| * describing the transformations performed to the original matrix A. |
| * |
| * Glu_freeable (input) *Glu_freeable_t |
| * The global structure describing the graph of L and U. |
| * |
| * LUstruct (input) LUstruct_t* |
| * Data structures for L and U factors. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. |
| * |
| * Return value |
| * ============ |
| * > 0, working storage required (in bytes). |
| * |
| */ |
| { |
| Glu_persist_t *Glu_persist = LUstruct->Glu_persist; |
| LocalLU_t *Llu = LUstruct->Llu; |
| int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, |
| len, len1, nsupc; |
| int_t ljb; /* local block column number */ |
| int_t nrbl; /* number of L blocks in current block column */ |
| int_t nrbu; /* number of U blocks in current block column */ |
| int_t gb; /* global block number; 0 < gb <= nsuper */ |
| int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ |
| int iam, jbrow, kcol, mycol, myrow, pc, pr; |
| int_t mybufmax[NBUFFERS]; |
| NRformat_loc *Astore; |
| double *a; |
| int_t *asub, *xa; |
| int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ |
| int_t *supno = Glu_persist->supno; |
| int_t *lsub, *xlsub, *usub, *xusub; |
| int_t nsupers; |
| int_t next_lind; /* next available position in index[*] */ |
| int_t next_lval; /* next available position in nzval[*] */ |
| int_t *index; /* indices consist of headers and row subscripts */ |
| double *lusup, *uval; /* nonzero values in L and U */ |
| double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ |
| int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ |
| double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ |
| int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ |
| /*-- Counts to be used in factorization. --*/ |
| int_t *ToRecv, *ToSendD, **ToSendR; |
| /*-- Counts to be used in lower triangular solve. --*/ |
| int_t *fmod; /* Modification count for L-solve. */ |
| int_t **fsendx_plist; /* Column process list to send down Xk. */ |
| int_t nfrecvx = 0; /* Number of Xk I will receive. */ |
| int_t nfsendx = 0; /* Number of Xk I will send */ |
| int_t kseen; |
| /*-- Counts to be used in upper triangular solve. --*/ |
| int_t *bmod; /* Modification count for U-solve. */ |
| int_t **bsendx_plist; /* Column process list to send down Xk. */ |
| int_t nbrecvx = 0; /* Number of Xk I will receive. */ |
| int_t nbsendx = 0; /* Number of Xk I will send */ |
| int_t *ilsum; /* starting position of each supernode in |
| the full array (local) */ |
| /*-- Auxiliary arrays; freed on return --*/ |
| int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ |
| int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ |
| int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ |
| int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ |
| int_t *Ucbs; /* number of column blocks in a block row */ |
| int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ |
| int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ |
| int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ |
| int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ |
| double *dense, *dense_col; /* SPA */ |
| double zero = 0.0; |
| int_t ldaspa; /* LDA of SPA */ |
| int_t iword, dword; |
| float mem_use = 0.0; |
| #if ( PRNTlevel>=1 ) |
| int_t nLblocks = 0, nUblocks = 0; |
| #endif |
| #if ( PROFlevel>=1 ) |
| double t, t_u, t_l; |
| int_t u_blks; |
| #endif |
| /* Initialization. */ |
| iam = grid->iam; |
| myrow = MYROW( iam, grid ); |
| mycol = MYCOL( iam, grid ); |
| for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; |
| nsupers = supno[n-1] + 1; |
| Astore = (NRformat_loc *) A->Store; |
| #if ( PRNTlevel>=1 ) |
| iword = sizeof(int_t); |
| dword = sizeof(double); |
| #endif |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter pddistribute()"); |
| #endif |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_(); |
| #endif |
| dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, |
| grid, &xa, &asub, &a); |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_() - t; |
| if ( !iam ) printf("--------\n" |
| ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); |
| #endif |
| if ( fact == SamePattern_SameRowPerm ) { |
| #if ( PROFlevel>=1 ) |
| t_l = t_u = 0; u_blks = 0; |
| #endif |
| /* We can propagate the new values of A into the existing |
| L and U data structures. */ |
| ilsum = Llu->ilsum; |
| ldaspa = Llu->ldalsum; |
| if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) |
| ABORT("Calloc fails for SPA dense[]."); |
| nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ |
| if ( !(Urb_length = intCalloc_dist(nrbu)) ) |
| ABORT("Calloc fails for Urb_length[]."); |
| if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) |
| ABORT("Malloc fails for Urb_indptr[]."); |
| Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; |
| Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; |
| Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; |
| Unzval_br_ptr = Llu->Unzval_br_ptr; |
| #if ( PRNTlevel>=1 ) |
| mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; |
| #endif |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_(); |
| #endif |
| /* Initialize Uval to zero. */ |
| for (lb = 0; lb < nrbu; ++lb) { |
| Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ |
| index = Ufstnz_br_ptr[lb]; |
| if ( index ) { |
| uval = Unzval_br_ptr[lb]; |
| len = index[1]; |
| for (i = 0; i < len; ++i) uval[i] = zero; |
| } /* if index != NULL */ |
| } /* for lb ... */ |
| for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ |
| pc = PCOL( jb, grid ); |
| if ( mycol == pc ) { /* Block column jb in my process column */ |
| fsupc = FstBlockC( jb ); |
| nsupc = SuperSize( jb ); |
| /* Scatter A into SPA (for L), or into U directly. */ |
| for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { |
| for (i = xa[j]; i < xa[j+1]; ++i) { |
| irow = asub[i]; |
| gb = BlockNum( irow ); |
| if ( myrow == PROW( gb, grid ) ) { |
| lb = LBi( gb, grid ); |
| if ( gb < jb ) { /* in U */ |
| index = Ufstnz_br_ptr[lb]; |
| uval = Unzval_br_ptr[lb]; |
| while ( (k = index[Urb_indptr[lb]]) < jb ) { |
| /* Skip nonzero values in this block */ |
| Urb_length[lb] += index[Urb_indptr[lb]+1]; |
| /* Move pointer to the next block */ |
| Urb_indptr[lb] += UB_DESCRIPTOR |
| + SuperSize( k ); |
| } |
| /*assert(k == jb);*/ |
| /* start fstnz */ |
| istart = Urb_indptr[lb] + UB_DESCRIPTOR; |
| len = Urb_length[lb]; |
| fsupc1 = FstBlockC( gb+1 ); |
| k = j - fsupc; |
| /* Sum the lengths of the leading columns */ |
| for (jj = 0; jj < k; ++jj) |
| len += fsupc1 - index[istart++]; |
| /*assert(irow>=index[istart]);*/ |
| uval[len + irow - index[istart]] = a[i]; |
| } else { /* in L; put in SPA first */ |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| dense_col[irow] = a[i]; |
| } |
| } |
| } /* for i ... */ |
| dense_col += ldaspa; |
| } /* for j ... */ |
| #if ( PROFlevel>=1 ) |
| t_u += SuperLU_timer_() - t; |
| t = SuperLU_timer_(); |
| #endif |
| /* Gather the values of A from SPA into Lnzval[]. */ |
| ljb = LBj( jb, grid ); /* Local block number */ |
| index = Lrowind_bc_ptr[ljb]; |
| if ( index ) { |
| nrbl = index[0]; /* Number of row blocks. */ |
| len = index[1]; /* LDA of lusup[]. */ |
| lusup = Lnzval_bc_ptr[ljb]; |
| next_lind = BC_HEADER; |
| next_lval = 0; |
| for (jj = 0; jj < nrbl; ++jj) { |
| gb = index[next_lind++]; |
| len1 = index[next_lind++]; /* Rows in the block. */ |
| lb = LBi( gb, grid ); |
| for (bnnz = 0; bnnz < len1; ++bnnz) { |
| irow = index[next_lind++]; /* Global index. */ |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| k = next_lval++; |
| for (j = 0, dense_col = dense; j < nsupc; ++j) { |
| lusup[k] = dense_col[irow]; |
| dense_col[irow] = zero; |
| k += len; |
| dense_col += ldaspa; |
| } |
| } /* for bnnz ... */ |
| } /* for jj ... */ |
| } /* if index ... */ |
| #if ( PROFlevel>=1 ) |
| t_l += SuperLU_timer_() - t; |
| #endif |
| } /* if mycol == pc */ |
| } /* for jb ... */ |
| SUPERLU_FREE(dense); |
| SUPERLU_FREE(Urb_length); |
| SUPERLU_FREE(Urb_indptr); |
| #if ( PROFlevel>=1 ) |
| if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", |
| t_l, t_u, u_blks, nrbu); |
| #endif |
| } else { |
| /* ------------------------------------------------------------ |
| FIRST TIME CREATING THE L AND U DATA STRUCTURES. |
| ------------------------------------------------------------*/ |
| #if ( PROFlevel>=1 ) |
| t_l = t_u = 0; u_blks = 0; |
| #endif |
| /* We first need to set up the L and U data structures and then |
| * propagate the values of A into them. |
| */ |
| lsub = Glu_freeable->lsub; /* compressed L subscripts */ |
| xlsub = Glu_freeable->xlsub; |
| usub = Glu_freeable->usub; /* compressed U subscripts */ |
| xusub = Glu_freeable->xusub; |
| if ( !(ToRecv = intCalloc_dist(nsupers)) ) |
| ABORT("Calloc fails for ToRecv[]."); |
| k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ |
| if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) |
| ABORT("Malloc fails for ToSendR[]."); |
| j = k * grid->npcol; |
| if ( !(index = intMalloc_dist(j)) ) |
| ABORT("Malloc fails for index[]."); |
| #if ( PRNTlevel>=1 ) |
| mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; |
| #endif |
| for (i = 0; i < j; ++i) index[i] = EMPTY; |
| for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; |
| k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ |
| /* Pointers to the beginning of each block row of U. */ |
| if ( !(Unzval_br_ptr = |
| (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) |
| ABORT("Malloc fails for Unzval_br_ptr[]."); |
| if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) |
| ABORT("Malloc fails for Ufstnz_br_ptr[]."); |
| if ( !(ToSendD = intCalloc_dist(k)) ) |
| ABORT("Malloc fails for ToSendD[]."); |
| if ( !(ilsum = intMalloc_dist(k+1)) ) |
| ABORT("Malloc fails for ilsum[]."); |
| /* Auxiliary arrays used to set up U block data structures. |
| They are freed on return. */ |
| if ( !(rb_marker = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for rb_marker[]."); |
| if ( !(Urb_length = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for Urb_length[]."); |
| if ( !(Urb_indptr = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for Urb_indptr[]."); |
| if ( !(Urb_fstnz = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for Urb_fstnz[]."); |
| if ( !(Ucbs = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for Ucbs[]."); |
| #if ( PRNTlevel>=1 ) |
| mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; |
| #endif |
| /* Compute ldaspa and ilsum[]. */ |
| ldaspa = 0; |
| ilsum[0] = 0; |
| for (gb = 0; gb < nsupers; ++gb) { |
| if ( myrow == PROW( gb, grid ) ) { |
| i = SuperSize( gb ); |
| ldaspa += i; |
| lb = LBi( gb, grid ); |
| ilsum[lb + 1] = ilsum[lb] + i; |
| } |
| } |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_(); |
| #endif |
| /* ------------------------------------------------------------ |
| COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. |
| THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). |
| ------------------------------------------------------------*/ |
| /* Loop through each supernode column. */ |
| for (jb = 0; jb < nsupers; ++jb) { |
| pc = PCOL( jb, grid ); |
| fsupc = FstBlockC( jb ); |
| nsupc = SuperSize( jb ); |
| /* Loop through each column in the block. */ |
| for (j = fsupc; j < fsupc + nsupc; ++j) { |
| /* usub[*] contains only "first nonzero" in each segment. */ |
| for (i = xusub[j]; i < xusub[j+1]; ++i) { |
| irow = usub[i]; /* First nonzero of the segment. */ |
| gb = BlockNum( irow ); |
| kcol = PCOL( gb, grid ); |
| ljb = LBj( gb, grid ); |
| if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; |
| pr = PROW( gb, grid ); |
| lb = LBi( gb, grid ); |
| if ( mycol == pc ) { |
| if ( myrow == pr ) { |
| ToSendD[lb] = YES; |
| /* Count nonzeros in entire block row. */ |
| Urb_length[lb] += FstBlockC( gb+1 ) - irow; |
| if (rb_marker[lb] <= jb) {/* First see the block */ |
| rb_marker[lb] = jb + 1; |
| Urb_fstnz[lb] += nsupc; |
| ++Ucbs[lb]; /* Number of column blocks |
| in block row lb. */ |
| #if ( PRNTlevel>=1 ) |
| ++nUblocks; |
| #endif |
| } |
| ToRecv[gb] = 1; |
| } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ |
| } |
| } /* for i ... */ |
| } /* for j ... */ |
| } /* for jb ... */ |
| /* Set up the initial pointers for each block row in U. */ |
| nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ |
| for (lb = 0; lb < nrbu; ++lb) { |
| len = Urb_length[lb]; |
| rb_marker[lb] = 0; /* Reset block marker. */ |
| if ( len ) { |
| /* Add room for descriptors */ |
| len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; |
| if ( !(index = intMalloc_dist(len1+1)) ) |
| ABORT("Malloc fails for Uindex[]."); |
| Ufstnz_br_ptr[lb] = index; |
| if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) |
| ABORT("Malloc fails for Unzval_br_ptr[*][]."); |
| mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); |
| mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); |
| index[0] = Ucbs[lb]; /* Number of column blocks */ |
| index[1] = len; /* Total length of nzval[] */ |
| index[2] = len1; /* Total length of index[] */ |
| index[len1] = -1; /* End marker */ |
| } else { |
| Ufstnz_br_ptr[lb] = NULL; |
| Unzval_br_ptr[lb] = NULL; |
| } |
| Urb_length[lb] = 0; /* Reset block length. */ |
| Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ |
| Urb_fstnz[lb] = BR_HEADER; |
| } /* for lb ... */ |
| SUPERLU_FREE(Ucbs); |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_() - t; |
| if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); |
| #endif |
| #if ( PRNTlevel>=1 ) |
| mem_use -= 2.0*k * iword; |
| #endif |
| /* Auxiliary arrays used to set up L block data structures. |
| They are freed on return. |
| k is the number of local row blocks. */ |
| if ( !(Lrb_length = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for Lrb_length[]."); |
| if ( !(Lrb_number = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for Lrb_number[]."); |
| if ( !(Lrb_indptr = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for Lrb_indptr[]."); |
| if ( !(Lrb_valptr = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for Lrb_valptr[]."); |
| if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3))) ) |
| ABORT("Calloc fails for SPA dense[]."); |
| /* These counts will be used for triangular solves. */ |
| if ( !(fmod = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for fmod[]."); |
| if ( !(bmod = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for bmod[]."); |
| /* ------------------------------------------------ */ |
| #if ( PRNTlevel>=1 ) |
| mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; |
| #endif |
| k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ |
| /* Pointers to the beginning of each block column of L. */ |
| if ( !(Lnzval_bc_ptr = |
| (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) |
| ABORT("Malloc fails for Lnzval_bc_ptr[]."); |
| if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) |
| ABORT("Malloc fails for Lrowind_bc_ptr[]."); |
| Lrowind_bc_ptr[k-1] = NULL; |
| /* These lists of processes will be used for triangular solves. */ |
| if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) |
| ABORT("Malloc fails for fsendx_plist[]."); |
| len = k * grid->nprow; |
| if ( !(index = intMalloc_dist(len)) ) |
| ABORT("Malloc fails for fsendx_plist[0]"); |
| for (i = 0; i < len; ++i) index[i] = EMPTY; |
| for (i = 0, j = 0; i < k; ++i, j += grid->nprow) |
| fsendx_plist[i] = &index[j]; |
| if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) |
| ABORT("Malloc fails for bsendx_plist[]."); |
| if ( !(index = intMalloc_dist(len)) ) |
| ABORT("Malloc fails for bsendx_plist[0]"); |
| for (i = 0; i < len; ++i) index[i] = EMPTY; |
| for (i = 0, j = 0; i < k; ++i, j += grid->nprow) |
| bsendx_plist[i] = &index[j]; |
| /* -------------------------------------------------------------- */ |
| #if ( PRNTlevel>=1 ) |
| mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; |
| #endif |
| /*------------------------------------------------------------ |
| PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. |
| THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. |
| ------------------------------------------------------------*/ |
| for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ |
| pc = PCOL( jb, grid ); |
| if ( mycol == pc ) { /* Block column jb in my process column */ |
| fsupc = FstBlockC( jb ); |
| nsupc = SuperSize( jb ); |
| ljb = LBj( jb, grid ); /* Local block number */ |
| /* Scatter A into SPA. */ |
| for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { |
| for (i = xa[j]; i < xa[j+1]; ++i) { |
| irow = asub[i]; |
| gb = BlockNum( irow ); |
| if ( myrow == PROW( gb, grid ) ) { |
| lb = LBi( gb, grid ); |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| dense_col[irow] = a[i]; |
| } |
| } |
| dense_col += ldaspa; |
| } /* for j ... */ |
| jbrow = PROW( jb, grid ); |
| /*------------------------------------------------ |
| * SET UP U BLOCKS. |
| *------------------------------------------------*/ |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_(); |
| #endif |
| kseen = 0; |
| dense_col = dense; |
| /* Loop through each column in the block column. */ |
| for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { |
| istart = xusub[j]; |
| /* NOTE: Only the first nonzero index of the segment |
| is stored in usub[]. */ |
| for (i = istart; i < xusub[j+1]; ++i) { |
| irow = usub[i]; /* First nonzero in the segment. */ |
| gb = BlockNum( irow ); |
| pr = PROW( gb, grid ); |
| if ( pr != jbrow && |
| myrow == jbrow && /* diag. proc. owning jb */ |
| bsendx_plist[ljb][pr] == EMPTY ) { |
| bsendx_plist[ljb][pr] = YES; |
| ++nbsendx; |
| } |
| if ( myrow == pr ) { |
| lb = LBi( gb, grid ); /* Local block number */ |
| index = Ufstnz_br_ptr[lb]; |
| uval = Unzval_br_ptr[lb]; |
| fsupc1 = FstBlockC( gb+1 ); |
| if (rb_marker[lb] <= jb) { /* First time see |
| the block */ |
| rb_marker[lb] = jb + 1; |
| Urb_indptr[lb] = Urb_fstnz[lb];; |
| index[Urb_indptr[lb]] = jb; /* Descriptor */ |
| Urb_indptr[lb] += UB_DESCRIPTOR; |
| /* Record the first location in index[] of the |
| next block */ |
| Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; |
| len = Urb_indptr[lb];/* Start fstnz in index */ |
| index[len-1] = 0; |
| for (k = 0; k < nsupc; ++k) |
| index[len+k] = fsupc1; |
| if ( gb != jb )/* Exclude diagonal block. */ |
| ++bmod[lb];/* Mod. count for back solve */ |
| if ( kseen == 0 && myrow != jbrow ) { |
| ++nbrecvx; |
| kseen = 1; |
| } |
| } else { /* Already saw the block */ |
| len = Urb_indptr[lb];/* Start fstnz in index */ |
| } |
| jj = j - fsupc; |
| index[len+jj] = irow; |
| /* Load the numerical values */ |
| k = fsupc1 - irow; /* No. of nonzeros in segment */ |
| index[len-1] += k; /* Increment block length in |
| Descriptor */ |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| for (ii = 0; ii < k; ++ii) { |
| uval[Urb_length[lb]++] = dense_col[irow + ii]; |
| dense_col[irow + ii] = zero; |
| } |
| } /* if myrow == pr ... */ |
| } /* for i ... */ |
| dense_col += ldaspa; |
| } /* for j ... */ |
| #if ( PROFlevel>=1 ) |
| t_u += SuperLU_timer_() - t; |
| t = SuperLU_timer_(); |
| #endif |
| /*------------------------------------------------ |
| * SET UP L BLOCKS. |
| *------------------------------------------------*/ |
| /* Count number of blocks and length of each block. */ |
| nrbl = 0; |
| len = 0; /* Number of row subscripts I own. */ |
| kseen = 0; |
| istart = xlsub[fsupc]; |
| for (i = istart; i < xlsub[fsupc+1]; ++i) { |
| irow = lsub[i]; |
| gb = BlockNum( irow ); /* Global block number */ |
| pr = PROW( gb, grid ); /* Process row owning this block */ |
| if ( pr != jbrow && |
| myrow == jbrow && /* diag. proc. owning jb */ |
| fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { |
| fsendx_plist[ljb][pr] = YES; |
| ++nfsendx; |
| } |
| if ( myrow == pr ) { |
| lb = LBi( gb, grid ); /* Local block number */ |
| if (rb_marker[lb] <= jb) { /* First see this block */ |
| rb_marker[lb] = jb + 1; |
| Lrb_length[lb] = 1; |
| Lrb_number[nrbl++] = gb; |
| if ( gb != jb ) /* Exclude diagonal block. */ |
| ++fmod[lb]; /* Mod. count for forward solve */ |
| if ( kseen == 0 && myrow != jbrow ) { |
| ++nfrecvx; |
| kseen = 1; |
| } |
| #if ( PRNTlevel>=1 ) |
| ++nLblocks; |
| #endif |
| } else { |
| ++Lrb_length[lb]; |
| } |
| ++len; |
| } |
| } /* for i ... */ |
| if ( nrbl ) { /* Do not ensure the blocks are sorted! */ |
| /* Set up the initial pointers for each block in |
| index[] and nzval[]. */ |
| /* Add room for descriptors */ |
| len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; |
| if ( !(index = intMalloc_dist(len1)) ) |
| ABORT("Malloc fails for index[]"); |
| Lrowind_bc_ptr[ljb] = index; |
| if (!(Lnzval_bc_ptr[ljb] = |
| doubleMalloc_dist(len*nsupc))) { |
| fprintf(stderr, "col block %d ", jb); |
| ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); |
| } |
| mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); |
| mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); |
| mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); |
| index[0] = nrbl; /* Number of row blocks */ |
| index[1] = len; /* LDA of the nzval[] */ |
| next_lind = BC_HEADER; |
| next_lval = 0; |
| for (k = 0; k < nrbl; ++k) { |
| gb = Lrb_number[k]; |
| lb = LBi( gb, grid ); |
| len = Lrb_length[lb]; |
| Lrb_length[lb] = 0; /* Reset vector of block length */ |
| index[next_lind++] = gb; /* Descriptor */ |
| index[next_lind++] = len; |
| Lrb_indptr[lb] = next_lind; |
| Lrb_valptr[lb] = next_lval; |
| next_lind += len; |
| next_lval += len; |
| } |
| /* Propagate the compressed row subscripts to Lindex[], |
| and the initial values of A from SPA into Lnzval[]. */ |
| lusup = Lnzval_bc_ptr[ljb]; |
| len = index[1]; /* LDA of lusup[] */ |
| for (i = istart; i < xlsub[fsupc+1]; ++i) { |
| irow = lsub[i]; |
| gb = BlockNum( irow ); |
| if ( myrow == PROW( gb, grid ) ) { |
| lb = LBi( gb, grid ); |
| k = Lrb_indptr[lb]++; /* Random access a block */ |
| index[k] = irow; |
| k = Lrb_valptr[lb]++; |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| for (j = 0, dense_col = dense; j < nsupc; ++j) { |
| lusup[k] = dense_col[irow]; |
| dense_col[irow] = zero; |
| k += len; |
| dense_col += ldaspa; |
| } |
| } |
| } /* for i ... */ |
| } else { |
| Lrowind_bc_ptr[ljb] = NULL; |
| Lnzval_bc_ptr[ljb] = NULL; |
| } /* if nrbl ... */ |
| #if ( PROFlevel>=1 ) |
| t_l += SuperLU_timer_() - t; |
| #endif |
| } /* if mycol == pc */ |
| } /* for jb ... */ |
| Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; |
| Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; |
| Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; |
| Llu->Unzval_br_ptr = Unzval_br_ptr; |
| Llu->ToRecv = ToRecv; |
| Llu->ToSendD = ToSendD; |
| Llu->ToSendR = ToSendR; |
| Llu->fmod = fmod; |
| Llu->fsendx_plist = fsendx_plist; |
| Llu->nfrecvx = nfrecvx; |
| Llu->nfsendx = nfsendx; |
| Llu->bmod = bmod; |
| Llu->bsendx_plist = bsendx_plist; |
| Llu->nbrecvx = nbrecvx; |
| Llu->nbsendx = nbsendx; |
| Llu->ilsum = ilsum; |
| Llu->ldalsum = ldaspa; |
| #if ( PRNTlevel>=1 ) |
| if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", |
| nLblocks, nUblocks); |
| #endif |
| SUPERLU_FREE(rb_marker); |
| SUPERLU_FREE(Urb_fstnz); |
| SUPERLU_FREE(Urb_length); |
| SUPERLU_FREE(Urb_indptr); |
| SUPERLU_FREE(Lrb_length); |
| SUPERLU_FREE(Lrb_number); |
| SUPERLU_FREE(Lrb_indptr); |
| SUPERLU_FREE(Lrb_valptr); |
| SUPERLU_FREE(dense); |
| /* Find the maximum buffer size. */ |
| MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, |
| MPI_MAX, grid->comm); |
| k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ |
| if ( !(Llu->mod_bit = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for mod_bit[]."); |
| #if ( PROFlevel>=1 ) |
| if ( !iam ) printf(".. 1st distribute time:\n " |
| "\tL\t%.2f\n\tU\t%.2f\n" |
| "\tu_blks %d\tnrbu %d\n--------\n", |
| t_l, t_u, u_blks, nrbu); |
| #endif |
| } /* else fact != SamePattern_SameRowPerm */ |
| if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ |
| SUPERLU_FREE(asub); |
| SUPERLU_FREE(a); |
| } |
| SUPERLU_FREE(xa); |
| #if ( DEBUGlevel>=1 ) |
| /* Memory allocated but not freed: |
| ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ |
| CHECK_MALLOC(iam, "Exit pddistribute()"); |
| #endif |
| return (mem_use); |
| } /* PDDISTRIBUTE */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/dreadrb.c |
|---|
| New file |
| 0,0 → 1,240 |
| /*! @file dreadrb.c |
| * \brief Read a matrix stored in Rutherford-Boeing format |
| * |
| * <pre> |
| * -- Distributed SuperLU routine (version 2.3) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * July 15, 2009 |
| * </pre> |
| * |
| * Purpose |
| * ======= |
| * |
| * Read a DOUBLE PRECISION matrix stored in Rutherford-Boeing format |
| * as described below. |
| * |
| * Line 1 (A72, A8) |
| * Col. 1 - 72 Title (TITLE) |
| * Col. 73 - 80 Matrix name / identifier (MTRXID) |
| * |
| * Line 2 (I14, 3(1X, I13)) |
| * Col. 1 - 14 Total number of lines excluding header (TOTCRD) |
| * Col. 16 - 28 Number of lines for pointers (PTRCRD) |
| * Col. 30 - 42 Number of lines for row (or variable) indices (INDCRD) |
| * Col. 44 - 56 Number of lines for numerical values (VALCRD) |
| * |
| * Line 3 (A3, 11X, 4(1X, I13)) |
| * Col. 1 - 3 Matrix type (see below) (MXTYPE) |
| * Col. 15 - 28 Compressed Column: Number of rows (NROW) |
| * Elemental: Largest integer used to index variable (MVAR) |
| * Col. 30 - 42 Compressed Column: Number of columns (NCOL) |
| * Elemental: Number of element matrices (NELT) |
| * Col. 44 - 56 Compressed Column: Number of entries (NNZERO) |
| * Elemental: Number of variable indeces (NVARIX) |
| * Col. 58 - 70 Compressed Column: Unused, explicitly zero |
| * Elemental: Number of elemental matrix entries (NELTVL) |
| * |
| * Line 4 (2A16, A20) |
| * Col. 1 - 16 Fortran format for pointers (PTRFMT) |
| * Col. 17 - 32 Fortran format for row (or variable) indices (INDFMT) |
| * Col. 33 - 52 Fortran format for numerical values of coefficient matrix |
| * (VALFMT) |
| * (blank in the case of matrix patterns) |
| * |
| * The three character type field on line 3 describes the matrix type. |
| * The following table lists the permitted values for each of the three |
| * characters. As an example of the type field, RSA denotes that the matrix |
| * is real, symmetric, and assembled. |
| * |
| * First Character: |
| * R Real matrix |
| * C Complex matrix |
| * I integer matrix |
| * P Pattern only (no numerical values supplied) |
| * Q Pattern only (numerical values supplied in associated auxiliary value |
| * file) |
| * |
| * Second Character: |
| * S Symmetric |
| * U Unsymmetric |
| * H Hermitian |
| * Z Skew symmetric |
| * R Rectangular |
| * |
| * Third Character: |
| * A Compressed column form |
| * E Elemental form |
| * |
| * </pre> |
| */ |
| #include "superlu_ddefs.h" |
| /*! \brief Eat up the rest of the current line */ |
| static int dDumpLine(FILE *fp) |
| { |
| register int c; |
| while ((c = fgetc(fp)) != '\n') ; |
| return 0; |
| } |
| static int dParseIntFormat(char *buf, int *num, int *size) |
| { |
| char *tmp; |
| tmp = buf; |
| while (*tmp++ != '(') ; |
| sscanf(tmp, "%d", num); |
| while (*tmp != 'I' && *tmp != 'i') ++tmp; |
| ++tmp; |
| sscanf(tmp, "%d", size); |
| return 0; |
| } |
| static int dParseFloatFormat(char *buf, int *num, int *size) |
| { |
| char *tmp, *period; |
| tmp = buf; |
| while (*tmp++ != '(') ; |
| *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ |
| while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' |
| && *tmp != 'F' && *tmp != 'f') { |
| /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the |
| num picked up refers to P, which should be skipped. */ |
| if (*tmp=='p' || *tmp=='P') { |
| ++tmp; |
| *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ |
| } else { |
| ++tmp; |
| } |
| } |
| ++tmp; |
| period = tmp; |
| while (*period != '.' && *period != ')') ++period ; |
| *period = '\0'; |
| *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ |
| return 0; |
| } |
| static int ReadVector(FILE *fp, int n, int *where, int perline, int persize) |
| { |
| register int i, j, item; |
| char tmp, buf[100]; |
| i = 0; |
| while (i < n) { |
| fgets(buf, 100, fp); /* read a line at a time */ |
| for (j=0; j<perline && i<n; j++) { |
| tmp = buf[(j+1)*persize]; /* save the char at that place */ |
| buf[(j+1)*persize] = 0; /* null terminate */ |
| item = atoi(&buf[j*persize]); |
| buf[(j+1)*persize] = tmp; /* recover the char at that place */ |
| where[i++] = item - 1; |
| } |
| } |
| return 0; |
| } |
| static int dReadValues(FILE *fp, int n, double *destination, int perline, |
| int persize) |
| { |
| register int i, j, k, s; |
| char tmp, buf[100]; |
| i = 0; |
| while (i < n) { |
| fgets(buf, 100, fp); /* read a line at a time */ |
| for (j=0; j<perline && i<n; j++) { |
| tmp = buf[(j+1)*persize]; /* save the char at that place */ |
| buf[(j+1)*persize] = 0; /* null terminate */ |
| s = j*persize; |
| for (k = 0; k < persize; ++k) /* No D_ format in C */ |
| if ( buf[s+k] == 'D' || buf[s+k] == 'd' ) buf[s+k] = 'E'; |
| destination[i++] = atof(&buf[s]); |
| buf[(j+1)*persize] = tmp; /* recover the char at that place */ |
| } |
| } |
| return 0; |
| } |
| void |
| dreadrb(FILE *fp, int *nrow, int *ncol, int *nonz, |
| double **nzval, int **rowind, int **colptr) |
| { |
| register int i, numer_lines = 0; |
| int tmp, colnum, colsize, rownum, rowsize, valnum, valsize; |
| char buf[100], type[4]; |
| /* Line 1 */ |
| fgets(buf, 100, fp); |
| fputs(buf, stdout); |
| /* Line 2 */ |
| for (i=0; i<4; i++) { |
| fscanf(fp, "%14c", buf); buf[14] = 0; |
| sscanf(buf, "%d", &tmp); |
| if (i == 3) numer_lines = tmp; |
| } |
| dDumpLine(fp); |
| /* Line 3 */ |
| fscanf(fp, "%3c", type); |
| fscanf(fp, "%11c", buf); /* pad */ |
| type[3] = 0; |
| #ifdef DEBUG |
| printf("Matrix type %s\n", type); |
| #endif |
| fscanf(fp, "%14c", buf); sscanf(buf, "%d", nrow); |
| fscanf(fp, "%14c", buf); sscanf(buf, "%d", ncol); |
| fscanf(fp, "%14c", buf); sscanf(buf, "%d", nonz); |
| fscanf(fp, "%14c", buf); sscanf(buf, "%d", &tmp); |
| if (tmp != 0) |
| printf("This is not an assembled matrix!\n"); |
| if (*nrow != *ncol) |
| printf("Matrix is not square.\n"); |
| dDumpLine(fp); |
| /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ |
| dallocateA_dist(*ncol, *nonz, nzval, rowind, colptr); |
| /* Line 4: format statement */ |
| fscanf(fp, "%16c", buf); |
| dParseIntFormat(buf, &colnum, &colsize); |
| fscanf(fp, "%16c", buf); |
| dParseIntFormat(buf, &rownum, &rowsize); |
| fscanf(fp, "%20c", buf); |
| dParseFloatFormat(buf, &valnum, &valsize); |
| dDumpLine(fp); |
| #ifdef DEBUG |
| printf("%d rows, %d nonzeros\n", *nrow, *nonz); |
| printf("colnum %d, colsize %d\n", colnum, colsize); |
| printf("rownum %d, rowsize %d\n", rownum, rowsize); |
| printf("valnum %d, valsize %d\n", valnum, valsize); |
| #endif |
| ReadVector(fp, *ncol+1, *colptr, colnum, colsize); |
| ReadVector(fp, *nonz, *rowind, rownum, rowsize); |
| if ( numer_lines ) { |
| dReadValues(fp, *nonz, *nzval, valnum, valsize); |
| } |
| //sym = (type[1] == 'S' || type[1] == 's'); |
| //if ( sym ) { |
| FormFullA(*ncol, nonz, nzval, rowind, colptr); |
| //} |
| fclose(fp); |
| } |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/memory.c.orig |
|---|
| New file |
| 0,0 → 1,546 |
| /* |
| * -- Distributed SuperLU routine (version 1.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * September 1, 1999 |
| * |
| */ |
| #include "superlu_ddefs.h" |
| #define PRNTlevel 0 |
| /* |
| * Global variables |
| */ |
| ExpHeader *expanders; /* Array of pointers to 4 types of memory */ |
| LU_stack_t stack; |
| int_t no_expand; |
| /* |
| * Prototype |
| */ |
| static int_t memory_usage(const int_t, const int_t, const int_t); |
| static void *expand(int_t *, MemType, int_t, int_t, |
| Glu_freeable_t *); |
| /* |
| * Internal prototypes |
| */ |
| void SetupSpace (void *, int_t, LU_space_t *); |
| void |
| superlu_abort_and_exit_dist(char *msg) |
| { |
| /*fprintf(stderr, msg); |
| fflush(stderr);*/ |
| printf(msg); |
| exit (-1); |
| } |
| long int superlu_malloc_total = 0; |
| #if ( DEBUGlevel>=1 ) /* Debug malloc/free. */ |
| #define PAD_FACTOR 2 |
| #define DWORD (sizeof(double)) /* Be sure it's no smaller than double. */ |
| void *superlu_malloc_dist(size_t size) |
| { |
| char *buf; |
| int iam; |
| MPI_Comm_rank(MPI_COMM_WORLD, &iam); |
| buf = (char *) malloc(size + DWORD); |
| if ( !buf ) { |
| printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %d\n", |
| iam, superlu_malloc_total*1e-6, size); |
| ABORT("superlu_malloc: out of memory"); |
| } |
| ((int_t *) buf)[0] = size; |
| #if 0 |
| superlu_malloc_total += size + DWORD; |
| #else |
| superlu_malloc_total += size; |
| #endif |
| return (void *) (buf + DWORD); |
| } |
| void superlu_free_dist(void *addr) |
| { |
| char *p = ((char *) addr) - DWORD; |
| if ( !addr ) |
| ABORT("superlu_free: tried to free NULL pointer"); |
| if ( !p ) |
| ABORT("superlu_free: tried to free NULL+DWORD pointer"); |
| { |
| int_t n = ((int_t *) p)[0]; |
| if ( !n ) |
| ABORT("superlu_free: tried to free a freed pointer"); |
| *((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */ |
| #if 0 |
| superlu_malloc_total -= (n + DWORD); |
| #else |
| superlu_malloc_total -= n; |
| #endif |
| if ( superlu_malloc_total < 0 ) |
| ABORT("superlu_malloc_total went negative"); |
| /*free (addr);*/ |
| free (p); |
| } |
| } |
| #else /* The production mode. */ |
| void *superlu_malloc_dist(size_t size) |
| { |
| void *buf; |
| buf = (void *) malloc(size); |
| return (buf); |
| } |
| void superlu_free_dist(void *addr) |
| { |
| free (addr); |
| } |
| #endif /* End debug malloc/free. */ |
| static void |
| copy_mem_int(int_t howmany, void *old, void *new) |
| { |
| register int_t i; |
| int_t *iold = old; |
| int_t *inew = new; |
| for (i = 0; i < howmany; i++) inew[i] = iold[i]; |
| } |
| static void |
| user_bcopy(char *src, char *dest, int_t bytes) |
| { |
| char *s_ptr, *d_ptr; |
| s_ptr = src + bytes - 1; |
| d_ptr = dest + bytes - 1; |
| for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr; |
| } |
| int_t *intMalloc_dist(int_t n) |
| { |
| int_t *buf; |
| buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); |
| return (buf); |
| } |
| int_t *intCalloc_dist(int_t n) |
| { |
| int_t *buf; |
| register int_t i; |
| buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); |
| if ( buf ) |
| for (i = 0; i < n; ++i) buf[i] = 0; |
| return (buf); |
| } |
| void *user_malloc_dist(int_t bytes, int_t which_end) |
| { |
| void *buf; |
| if ( StackFull(bytes) ) return (NULL); |
| if ( which_end == HEAD ) { |
| buf = (char*) stack.array + stack.top1; |
| stack.top1 += bytes; |
| } else { |
| stack.top2 -= bytes; |
| buf = (char*) stack.array + stack.top2; |
| } |
| stack.used += bytes; |
| return buf; |
| } |
| void user_free_dist(int_t bytes, int_t which_end) |
| { |
| if ( which_end == HEAD ) { |
| stack.top1 -= bytes; |
| } else { |
| stack.top2 += bytes; |
| } |
| stack.used -= bytes; |
| } |
| /* |
| * Setup the memory model to be used for factorization. |
| * lwork = 0: use system malloc; |
| * lwork > 0: use user-supplied work[] space. |
| */ |
| void SetupSpace(void *work, int_t lwork, LU_space_t *MemModel) |
| { |
| if ( lwork == 0 ) { |
| *MemModel = SYSTEM; /* malloc/free */ |
| } else if ( lwork > 0 ) { |
| *MemModel = USER; /* user provided space */ |
| stack.used = 0; |
| stack.top1 = 0; |
| stack.top2 = (lwork/4)*4; /* must be word addressable */ |
| stack.size = stack.top2; |
| stack.array = (void *) work; |
| } |
| } |
| /* |
| * Allocate storage for the data structures common to symbolic factorization |
| * routines. For those unpredictable size, make a guess as FILL * nnz(A). |
| * Return value: |
| * If lwork = -1, return the estimated amount of space required, plus n; |
| * otherwise, return the amount of space actually allocated when |
| * memory allocation failure occurred. |
| */ |
| /************************************************************************/ |
| int_t symbfact_SubInit |
| /************************************************************************/ |
| ( |
| fact_t fact, void *work, int_t lwork, int_t m, int_t n, int_t annz, |
| Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable |
| ) |
| { |
| int_t iword; |
| int_t *xsup, *supno; |
| int_t *lsub, *xlsub; |
| int_t *usub, *xusub; |
| int_t nzlmax, nzumax; |
| int_t FILL = sp_ienv_dist(6); |
| #if ( DEBUGlevel>=1 ) |
| int iam; |
| MPI_Comm_rank( MPI_COMM_WORLD, &iam ); |
| CHECK_MALLOC(iam, "Enter symbfact_SubInit()"); |
| #endif |
| no_expand = 0; |
| iword = sizeof(int_t); |
| expanders = (ExpHeader *) SUPERLU_MALLOC( NO_MEMTYPE*sizeof(ExpHeader) ); |
| if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders"); |
| if ( fact == DOFACT || fact == SamePattern ) { |
| /* Guess for L\U factors */ |
| nzlmax = FILL * annz; |
| nzumax = FILL/2.0 * annz; |
| if ( lwork == -1 ) { |
| return ( GluIntArray(n) * iword + TempSpace(m,1) |
| + (nzlmax+nzumax)*iword + n ); |
| } else { |
| SetupSpace(work, lwork, &Glu_freeable->MemModel); |
| } |
| #if ( PRNTlevel>=1 ) |
| printf(".. symbfact_SubInit(): annz %ld, nzlmax %ld, nzumax %ld\n", |
| annz, nzlmax, nzumax); |
| #endif |
| /* Integer pointers for L\U factors */ |
| if ( Glu_freeable->MemModel == SYSTEM ) { |
| xsup = intMalloc_dist(n+1); |
| supno = intMalloc_dist(n+1); |
| xlsub = intMalloc_dist(n+1); |
| xusub = intMalloc_dist(n+1); |
| } else { |
| xsup = (int_t *)user_malloc_dist((n+1) * iword, HEAD); |
| supno = (int_t *)user_malloc_dist((n+1) * iword, HEAD); |
| xlsub = (int_t *)user_malloc_dist((n+1) * iword, HEAD); |
| xusub = (int_t *)user_malloc_dist((n+1) * iword, HEAD); |
| } |
| lsub = (int_t *) expand(&nzlmax, LSUB, 0, 0, Glu_freeable); |
| usub = (int_t *) expand(&nzumax, USUB, 0, 0, Glu_freeable); |
| while ( !lsub || !usub ) { |
| if ( Glu_freeable->MemModel == SYSTEM ) { |
| SUPERLU_FREE(lsub); |
| SUPERLU_FREE(usub); |
| } else { |
| user_free_dist((nzlmax+nzumax)*iword, HEAD); |
| } |
| nzlmax /= 2; |
| nzumax /= 2; |
| if ( nzumax < annz/2 ) { |
| printf("Not enough memory to perform factorization.\n"); |
| return (memory_usage(nzlmax, nzumax, n) + n); |
| } |
| #if ( PRNTlevel>=1 ) |
| printf(".. symbfact_SubInit() reduce size:" |
| "nzlmax %ld, nzumax %ld\n", nzlmax, nzumax); |
| fflush(stdout); |
| #endif |
| lsub = (int_t *) expand( &nzlmax, LSUB, 0, 0, Glu_freeable ); |
| usub = (int_t *) expand( &nzumax, USUB, 0, 1, Glu_freeable ); |
| } |
| Glu_persist->xsup = xsup; |
| Glu_persist->supno = supno; |
| Glu_freeable->lsub = lsub; |
| Glu_freeable->xlsub = xlsub; |
| Glu_freeable->usub = usub; |
| Glu_freeable->xusub = xusub; |
| Glu_freeable->nzlmax = nzlmax; |
| Glu_freeable->nzumax = nzumax; |
| } else { |
| /* fact == SamePattern_SameRowPerm */ |
| if ( lwork == -1 ) { |
| return ( GluIntArray(n) * iword + TempSpace(m, 1) |
| + (nzlmax+nzumax)*iword + n ); |
| } else if ( lwork == 0 ) { |
| Glu_freeable->MemModel = SYSTEM; |
| } else { |
| Glu_freeable->MemModel = USER; |
| stack.top2 = (lwork/4)*4; /* must be word-addressable */ |
| stack.size = stack.top2; |
| } |
| expanders[USUB].mem = Glu_freeable->usub; |
| expanders[LSUB].mem = Glu_freeable->lsub; |
| expanders[USUB].size = nzumax; |
| expanders[LSUB].size = nzlmax; |
| } |
| ++no_expand; |
| #if ( DEBUGlevel>=1 ) |
| /* Memory allocated but not freed: xsup, supno */ |
| CHECK_MALLOC(iam, "Exit symbfact_SubInit()"); |
| #endif |
| return 0; |
| } /* SYMBFACT_SUBINIT */ |
| /* |
| * Expand the data structures for L and U during the factorization. |
| * Return value: 0 - successful return |
| * > 0 - number of bytes allocated when run out of space |
| */ |
| /************************************************************************/ |
| int_t symbfact_SubXpand |
| /************************************************************************/ |
| ( |
| int_t n, /* total number of columns */ |
| int_t jcol, /* current column */ |
| int_t next, /* number of elements currently in the factors */ |
| MemType mem_type, /* which type of memory to expand */ |
| int_t *maxlen, /* modified - maximum length of a data structure */ |
| Glu_freeable_t *Glu_freeable /* modified - global LU data structures */ |
| ) |
| { |
| void *new_mem; |
| #if ( DEBUGlevel>=1 ) |
| printf("symbfact_SubXpand(): jcol %d, next %ld, maxlen %ld, MemType %d\n", |
| jcol, next, *maxlen, mem_type); |
| #endif |
| new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable); |
| if ( !new_mem ) { |
| int_t nzlmax = Glu_freeable->nzlmax; |
| int_t nzumax = Glu_freeable->nzumax; |
| fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol); |
| return (memory_usage(nzlmax, nzumax, n) + n); |
| } |
| if ( mem_type == LSUB ) { |
| Glu_freeable->lsub = (int_t *) new_mem; |
| Glu_freeable->nzlmax = *maxlen; |
| } else if ( mem_type == USUB ) { |
| Glu_freeable->usub = (int_t *) new_mem; |
| Glu_freeable->nzumax = *maxlen; |
| } else ABORT("Tries to expand nonexisting memory type.\n"); |
| return 0; |
| } /* LUSUB_XPAND */ |
| /* |
| * Deallocate storage of the data structures common to symbolic |
| * factorization routines. |
| */ |
| /************************************************************************/ |
| int_t symbfact_SubFree(Glu_freeable_t *Glu_freeable) |
| /************************************************************************/ |
| { |
| #if ( DEBUGlevel>=1 ) |
| int iam; |
| MPI_Comm_rank( MPI_COMM_WORLD, &iam ); |
| CHECK_MALLOC(iam, "Enter symbfact_SubFree()"); |
| #endif |
| SUPERLU_FREE(expanders); |
| SUPERLU_FREE(Glu_freeable->lsub); |
| SUPERLU_FREE(Glu_freeable->xlsub); |
| SUPERLU_FREE(Glu_freeable->usub); |
| SUPERLU_FREE(Glu_freeable->xusub); |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit symbfact_SubFree()"); |
| #endif |
| return 0; |
| } /* SYMBFACT_SUBFREE */ |
| /* |
| * Expand the existing storage to accommodate more fill-ins. |
| */ |
| /************************************************************************/ |
| static void *expand |
| /************************************************************************/ |
| ( |
| int_t *prev_len, /* length used from previous call */ |
| MemType type, /* which part of the memory to expand */ |
| int_t len_to_copy, /* size of the memory to be copied to new store */ |
| int_t keep_prev, /* = 1: use prev_len; |
| = 0: compute new_len to expand */ |
| Glu_freeable_t *Glu_freeable /* modified - global LU data structures */ |
| ) |
| { |
| float EXPAND = 1.5; |
| float alpha; |
| void *new_mem; |
| int_t new_len, tries, lword, extra, bytes_to_copy; |
| alpha = EXPAND; |
| lword = sizeof(int_t); |
| if ( no_expand == 0 || keep_prev ) /* First time allocate requested */ |
| new_len = *prev_len; |
| else { |
| new_len = alpha * *prev_len; |
| } |
| if ( Glu_freeable->MemModel == SYSTEM ) { |
| new_mem = (void *) SUPERLU_MALLOC(new_len * lword); |
| /*new_mem = (void *) calloc(new_len, lword); */ |
| if ( no_expand != 0 ) { |
| tries = 0; |
| if ( keep_prev ) { |
| if ( !new_mem ) return (NULL); |
| } else { |
| while ( !new_mem ) { |
| if ( ++tries > 10 ) return (NULL); |
| alpha = Reduce(alpha); |
| new_len = alpha * *prev_len; |
| new_mem = (void *) SUPERLU_MALLOC(new_len * lword); |
| /* new_mem = (void *) calloc(new_len, lword); */ |
| } |
| } |
| copy_mem_int(len_to_copy, expanders[type].mem, new_mem); |
| SUPERLU_FREE (expanders[type].mem); |
| } |
| expanders[type].mem = (void *) new_mem; |
| } else { /* MemModel == USER */ |
| if ( no_expand == 0 ) { |
| new_mem = user_malloc_dist(new_len * lword, HEAD); |
| expanders[type].mem = (void *) new_mem; |
| } |
| else { |
| tries = 0; |
| extra = (new_len - *prev_len) * lword; |
| if ( keep_prev ) { |
| if ( StackFull(extra) ) return (NULL); |
| } else { |
| while ( StackFull(extra) ) { |
| if ( ++tries > 10 ) return (NULL); |
| alpha = Reduce(alpha); |
| new_len = alpha * *prev_len; |
| extra = (new_len - *prev_len) * lword; |
| } |
| } |
| if ( type != USUB ) { |
| new_mem = (void*)((char*)expanders[type + 1].mem + extra); |
| bytes_to_copy = (char*)stack.array + stack.top1 |
| - (char*)expanders[type + 1].mem; |
| user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy); |
| if ( type < USUB ) { |
| Glu_freeable->usub = expanders[USUB].mem = |
| (void*)((char*)expanders[USUB].mem + extra); |
| } |
| if ( type < LSUB ) { |
| Glu_freeable->lsub = expanders[LSUB].mem = |
| (void*)((char*)expanders[LSUB].mem + extra); |
| } |
| stack.top1 += extra; |
| stack.used += extra; |
| } /* if ... */ |
| } /* else ... */ |
| } |
| expanders[type].size = new_len; |
| *prev_len = new_len; |
| if ( no_expand ) ++no_expand; |
| return (void *) expanders[type].mem; |
| } /* EXPAND */ |
| /* |
| * mem_usage consists of the following fields: |
| * - for_lu (float) |
| * The amount of space used in bytes for the L\U data structures. |
| * - total (float) |
| * The amount of space needed in bytes to perform factorization. |
| * - expansions (int) |
| * Number of memory expansions during the LU factorization. |
| */ |
| /************************************************************************/ |
| int_t QuerySpace_dist(int_t n, int_t lsub_size, Glu_freeable_t *Glu_freeable, |
| mem_usage_t *mem_usage) |
| /************************************************************************/ |
| { |
| register int_t iword = sizeof(int_t); |
| extern int_t no_expand; |
| /* For the adjacency graphs of L and U. */ |
| /*mem_usage->for_lu = (float)( (4*n + 3) * iword + |
| Glu_freeable->xlsub[n]*iword );*/ |
| mem_usage->for_lu = (float)( (4*n + 3) * iword + |
| lsub_size * iword ); |
| mem_usage->for_lu += (float)( (n + 1) * iword + |
| Glu_freeable->xusub[n]*iword ); |
| /* Working storage to support factorization */ |
| mem_usage->total = mem_usage->for_lu + 9*n*iword; |
| mem_usage->expansions = --no_expand; |
| return 0; |
| } /* QUERYSPACE_DIST */ |
| static int_t |
| memory_usage(const int_t nzlmax, const int_t nzumax, const int_t n) |
| { |
| register int_t iword = sizeof(int_t); |
| return (10*n*iword + (nzlmax+nzumax)*iword); |
| } |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/get_perm_c_parmetis.c |
|---|
| New file |
| 0,0 → 1,905 |
| /*! @file |
| * \brief Gets matrix permutation |
| * |
| * <pre> |
| * -- Distributed symbolic factorization auxialiary routine (version 2.1) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley - July 2003 |
| * INRIA France - January 2004 |
| * Laura Grigori |
| * |
| * November 1, 2007 |
| * </pre> |
| */ |
| /* limits.h: the largest positive integer (INT_MAX) */ |
| #include <limits.h> |
| #include <math.h> |
| #include "superlu_ddefs.h" |
| /* |
| * Internal protypes |
| */ |
| static float |
| a_plus_at_CompRow_loc |
| (int, int_t *, int, int_t *, int_t , int_t *, int_t *, |
| int, int_t *, int_t *, int_t **, int_t **, gridinfo_t *); |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * |
| * GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a |
| * graph partitioning algorithm to the symmetrized graph A+A'. The |
| * multilevel graph partitioning algorithm used is the |
| * ParMETIS_V3_NodeND routine available in the parallel graph |
| * partitioning package parMETIS. |
| * |
| * The number of independent sub-domains noDomains computed by this |
| * algorithm has to be a power of 2. Hence noDomains is the larger |
| * number power of 2 that is smaller than nprocs_i, where nprocs_i = nprow |
| * * npcol is the number of processors used in SuperLU_DIST. |
| * |
| * Arguments |
| * ========= |
| * |
| * A (input) SuperMatrix* |
| * Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number |
| * of the linear equations is A->nrow. Matrix A is distributed |
| * in NRformat_loc format. |
| * |
| * perm_r (input) int_t* |
| * Row permutation vector of size A->nrow, which defines the |
| * permutation matrix Pr; perm_r[i] = j means row i of A is in |
| * position j in Pr*A. |
| * |
| * perm_c (output) int_t* |
| * Column permutation vector of size A->ncol, which defines the |
| * permutation matrix Pc; perm_c[i] = j means column i of A is |
| * in position j in A*Pc. |
| * |
| * nprocs_i (input) int* |
| * Number of processors the input matrix is distributed on in a block |
| * row format. It corresponds to number of processors used in |
| * SuperLU_DIST. |
| * |
| * noDomains (input) int*, must be power of 2 |
| * Number of independent domains to be computed by the graph |
| * partitioning algorithm. ( noDomains <= nprocs_i ) |
| * |
| * sizes (output) int_t**, of size 2 * noDomains |
| * Returns pointer to an array containing the number of nodes |
| * for each sub-domain and each separator. Separators are stored |
| * from left to right. |
| * Memory for the array is allocated in this routine. |
| * |
| * fstVtxSep (output) int_t**, of size 2 * noDomains |
| * Returns pointer to an array containing first node for each |
| * sub-domain and each separator. |
| * Memory for the array is allocated in this routine. |
| * |
| * Return value |
| * ============ |
| * < 0, number of bytes allocated on return from the symbolic factorization. |
| * > 0, number of bytes allocated when out of memory. |
| * </pre> |
| */ |
| float |
| get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c, |
| int nprocs_i, int noDomains, |
| int_t **sizes, int_t **fstVtxSep, |
| gridinfo_t *grid, MPI_Comm *metis_comm) |
| { |
| NRformat_loc *Astore; |
| int iam, p; |
| int *b_rowptr_int, *b_colind_int, *l_sizes_int, *dist_order_int, *vtxdist_o_int; |
| int *options, numflag; |
| int_t m_loc, nnz_loc, fst_row; |
| int_t m, n, bnz, i, j; |
| int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes; |
| int_t *b_rowptr, *b_colind; |
| int_t *dist_order; |
| int *recvcnts, *displs; |
| /* first row index on each processor when the matrix is distributed |
| on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */ |
| int_t *vtxdist_i, *vtxdist_o; |
| int_t szSep, k, noNodes; |
| float apat_mem_l; /* memory used during the computation of the graph of A+A' */ |
| float mem; /* Memory used during this routine */ |
| MPI_Status status; |
| /* Initialization. */ |
| MPI_Comm_rank (grid->comm, &iam); |
| n = A->ncol; |
| m = A->nrow; |
| if ( m != n ) ABORT("Matrix is not square"); |
| mem = 0.; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter get_perm_c_parmetis()"); |
| #endif |
| Astore = (NRformat_loc *) A->Store; |
| nnz_loc = Astore->nnz_loc; /* number of nonzeros in the local submatrix */ |
| m_loc = Astore->m_loc; /* number of rows local to this processor */ |
| fst_row = Astore->fst_row; /* global index of the first row */ |
| rowptr = Astore->rowptr; /* pointer to rows and column indices */ |
| colind = Astore->colind; |
| #if ( PRNTlevel>=1 ) |
| if ( !iam ) printf(".. Use parMETIS ordering on A'+A with %d sub-domains.\n", |
| noDomains); |
| #endif |
| numflag = 0; |
| /* determine first row on each processor */ |
| vtxdist_i = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t)); |
| if ( !vtxdist_i ) ABORT("SUPERLU_MALLOC fails for vtxdist_i."); |
| vtxdist_o = (int_t *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int_t)); |
| if ( !vtxdist_o ) ABORT("SUPERLU_MALLOC fails for vtxdist_o."); |
| MPI_Allgather (&fst_row, 1, mpi_int_t, vtxdist_i, 1, mpi_int_t, |
| grid->comm); |
| vtxdist_i[nprocs_i] = m; |
| if (noDomains == nprocs_i) { |
| /* keep the same distribution of A */ |
| for (p = 0; p <= nprocs_i; p++) |
| vtxdist_o[p] = vtxdist_i[p]; |
| } |
| else { |
| i = n / noDomains; |
| j = n % noDomains; |
| for (k = 0, p = 0; p < noDomains; p++) { |
| vtxdist_o[p] = k; |
| k += i; |
| if (p < j) k++; |
| } |
| /* The remaining non-participating processors get the same |
| first-row-number as the last processor. */ |
| for (p = noDomains; p <= nprocs_i; p++) |
| vtxdist_o[p] = k; |
| } |
| #if ( DEBUGlevel>=2 ) |
| if (!iam) |
| PrintInt10 ("vtxdist_o", nprocs_i + 1, vtxdist_o); |
| #endif |
| /* Compute distributed A + A' */ |
| if ((apat_mem_l = |
| a_plus_at_CompRow_loc(iam, perm_r, nprocs_i, vtxdist_i, |
| n, rowptr, colind, noDomains, vtxdist_o, |
| &bnz, &b_rowptr, &b_colind, grid)) > 0) |
| return (apat_mem_l); |
| mem += -apat_mem_l; |
| /* Initialize and allocate storage for parMetis. */ |
| (*sizes) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t)); |
| if (!(*sizes)) ABORT("SUPERLU_MALLOC fails for sizes."); |
| l_sizes = *sizes; |
| (*fstVtxSep) = (int_t *) SUPERLU_MALLOC(2 * noDomains * sizeof(int_t)); |
| if (!(*fstVtxSep)) ABORT("SUPERLU_MALLOC fails for fstVtxSep."); |
| l_fstVtxSep = *fstVtxSep; |
| m_loc = vtxdist_o[iam+1] - vtxdist_o[iam]; |
| if ( iam < noDomains) |
| /* dist_order_int is the perm returned by parMetis, distributed */ |
| if (! (dist_order_int = (int *) SUPERLU_MALLOC(m_loc * sizeof(int)))) |
| ABORT("SUPERLU_MALLOC fails for dist_order_int."); |
| /* ParMETIS represents the column pointers and row indices of * |
| * the input matrix using integers. When SuperLU_DIST uses * |
| * long int for the int_t type, then several supplementary * |
| * copies need to be performed in order to call ParMETIS. */ |
| #if defined (_LONGINT) |
| l_sizes_int = (int *) SUPERLU_MALLOC(2 * noDomains * sizeof(int)); |
| if (!(l_sizes_int)) ABORT("SUPERLU_MALLOC fails for l_sizes_int."); |
| /* Allocate storage */ |
| if ( !(b_rowptr_int = (int*) SUPERLU_MALLOC((m_loc+1) * sizeof(int)))) |
| ABORT("SUPERLU_MALLOC fails for b_rowptr_int[]"); |
| for (i = 0; i <= m_loc; i++) |
| b_rowptr_int[i] = b_rowptr[i]; |
| SUPERLU_FREE (b_rowptr); |
| if ( bnz ) { |
| if ( !(b_colind_int = (int *) SUPERLU_MALLOC( bnz * sizeof(int)))) |
| ABORT("SUPERLU_MALLOC fails for b_colind_int[]"); |
| for (i = 0; i < bnz; i++) |
| b_colind_int[i] = b_colind[i]; |
| SUPERLU_FREE (b_colind); |
| } |
| if ( !(vtxdist_o_int = |
| (int *) SUPERLU_MALLOC((nprocs_i+1) * sizeof(int)))) |
| ABORT("SUPERLU_MALLOC fails for vtxdist_o_int."); |
| for (i = 0; i <= nprocs_i; i++) |
| vtxdist_o_int[i] = vtxdist_o[i]; |
| SUPERLU_FREE (vtxdist_o); |
| #else /* Default */ |
| vtxdist_o_int = vtxdist_o; |
| b_rowptr_int = b_rowptr; b_colind_int = b_colind; |
| l_sizes_int = l_sizes; |
| #endif |
| if ( iam < noDomains) { |
| options = (int *) SUPERLU_MALLOC(4 * sizeof(int)); |
| options[0] = 0; |
| options[1] = 0; |
| options[2] = 0; |
| options[3] = 1; |
| ParMETIS_V3_NodeND(vtxdist_o_int, b_rowptr_int, b_colind_int, |
| &numflag, options, |
| dist_order_int, l_sizes_int, metis_comm); |
| } |
| if (bnz) |
| SUPERLU_FREE (b_colind_int); |
| if ( iam < noDomains) { |
| SUPERLU_FREE (options); |
| } |
| SUPERLU_FREE (b_rowptr_int); |
| #if defined (_LONGINT) |
| /* Copy data from dist_order_int to dist_order */ |
| if ( iam < noDomains) { |
| /* dist_order is the perm returned by parMetis, distributed */ |
| if (!(dist_order = (int_t *) SUPERLU_MALLOC(m_loc * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for dist_order."); |
| for (i = 0; i < m_loc; i++) |
| dist_order[i] = dist_order_int[i]; |
| SUPERLU_FREE(dist_order_int); |
| for (i = 0; i < 2*noDomains; i++) |
| l_sizes[i] = l_sizes_int[i]; |
| SUPERLU_FREE(l_sizes_int); |
| } |
| #else |
| dist_order = dist_order_int; |
| #endif |
| /* Allgatherv dist_order to get perm_c */ |
| if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int)))) |
| ABORT ("SUPERLU_MALLOC fails for displs."); |
| if ( !(recvcnts = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int)))) |
| ABORT ("SUPERLU_MALLOC fails for recvcnts."); |
| for (i = 0; i < nprocs_i; i++) |
| recvcnts[i] = vtxdist_o_int[i+1] - vtxdist_o_int[i]; |
| displs[0]=0; |
| for(i=1; i < nprocs_i; i++) |
| displs[i] = displs[i-1] + recvcnts[i-1]; |
| MPI_Allgatherv (dist_order, m_loc, mpi_int_t, perm_c, recvcnts, displs, |
| mpi_int_t, grid->comm); |
| if ( iam < noDomains) { |
| SUPERLU_FREE (dist_order); |
| } |
| SUPERLU_FREE (vtxdist_i); |
| SUPERLU_FREE (vtxdist_o_int); |
| SUPERLU_FREE (recvcnts); |
| SUPERLU_FREE (displs); |
| /* send l_sizes to every processor p >= noDomains */ |
| if (!iam) |
| for (p = noDomains; p < nprocs_i; p++) |
| MPI_Send (l_sizes, 2*noDomains, mpi_int_t, p, 0, grid->comm); |
| if (noDomains <= iam && iam < nprocs_i) |
| MPI_Recv (l_sizes, 2*noDomains, mpi_int_t, 0, 0, grid->comm, |
| &status); |
| /* Determine the first node in each separator, store it in l_fstVtxSep */ |
| for (j = 0; j < 2 * noDomains; j++) |
| l_fstVtxSep[j] = 0; |
| l_fstVtxSep[2*noDomains - 2] = l_sizes[2*noDomains - 2]; |
| szSep = noDomains; |
| i = 0; |
| while (szSep != 1) { |
| for (j = i; j < i + szSep; j++) { |
| l_fstVtxSep[j] += l_sizes[j]; |
| } |
| for (j = i; j < i + szSep; j++) { |
| k = i + szSep + (j-i) / 2; |
| l_fstVtxSep[k] += l_fstVtxSep[j]; |
| } |
| i += szSep; |
| szSep = szSep / 2; |
| } |
| l_fstVtxSep[2 * noDomains - 2] -= l_sizes[2 * noDomains - 2]; |
| i = 2 * noDomains - 2; |
| szSep = 1; |
| while (i > 0) { |
| for (j = i; j < i + szSep; j++) { |
| k = (i - 2 * szSep) + (j-i) * 2 + 1; |
| noNodes = l_fstVtxSep[k]; |
| l_fstVtxSep[k] = l_fstVtxSep[j] - l_sizes[k]; |
| l_fstVtxSep[k-1] = l_fstVtxSep[k] + l_sizes[k] - |
| noNodes - l_sizes[k-1]; |
| } |
| szSep *= 2; |
| i -= szSep; |
| } |
| #if ( PRNTlevel>=2 ) |
| if (!iam ) { |
| PrintInt10 ("Sizes of separators", 2 * noDomains-1, l_sizes); |
| PrintInt10 ("First Vertex Separator", 2 * noDomains-1, l_fstVtxSep); |
| } |
| #endif |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit get_perm_c_parmetis()"); |
| #endif |
| return (-mem); |
| } /* get_perm_c_parmetis */ |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * |
| * Form the structure of Pr*A +A'Pr'. A is an n-by-n matrix in |
| * NRformat_loc format, represented by (rowptr, colind). The output |
| * B=Pr*A +A'Pr' is in NRformat_loc format (symmetrically, also row |
| * oriented), represented by (b_rowptr, b_colind). |
| * |
| * The input matrix A is distributed in block row format on nprocs_i |
| * processors. The output matrix B is distributed in block row format |
| * on nprocs_o processors, where nprocs_o <= nprocs_i. On output, the |
| * matrix B has its rows permuted according to perm_r. |
| * |
| * Sketch of the algorithm |
| * ======================= |
| * |
| * Let iam by my process number. Let fst_row, lst_row = m_loc + |
| * fst_row be the first/last row stored on iam. |
| * |
| * Compute Pr' - the inverse row permutation, stored in iperm_r. |
| * |
| * Compute the transpose of the block row of Pr*A that iam owns: |
| * T[:,Pr(fst_row:lst_row)] = Pr' * A[:,fst_row:lst_row] * Pr' |
| * |
| * |
| * All to all communication such that every processor iam receives all |
| * the blocks of the transpose matrix that it needs, that is |
| * T[fst_row:lst_row, :] |
| * |
| * Compute B = A[fst_row:lst_row, :] + T[fst_row:lst_row, :] |
| * |
| * If Pr != I or nprocs_i != nprocs_o then permute the rows of B (that |
| * is compute Pr*B) and redistribute from nprocs_i to nprocs_o |
| * according to the block row distribution in vtxdist_i, vtxdist_o. |
| * </pre> |
| */ |
| static float |
| a_plus_at_CompRow_loc |
| ( |
| int iam, /* Input - my processor number */ |
| int_t *perm_r, /* Input - row permutation vector Pr */ |
| int nprocs_i, /* Input - number of processors the input matrix |
| is distributed on */ |
| int_t *vtxdist_i, /* Input - index of first row on each processor of the input matrix */ |
| int_t n, /* Input - number of columns in matrix A. */ |
| int_t *rowptr, /* Input - row pointers of size m_loc+1 for matrix A. */ |
| int_t *colind, /* Input - column indices of size nnz_loc for matrix A. */ |
| int nprocs_o, /* Input - number of processors the output matrix |
| is distributed on */ |
| int_t *vtxdist_o, /* Input - index of first row on each processor of the output matrix */ |
| int_t *p_bnz, /* Output - on exit, returns the actual number of |
| local nonzeros in matrix A'+A. */ |
| int_t **p_b_rowptr, /* Output - output matrix, row pointers of size m_loc+1 */ |
| int_t **p_b_colind, /* Output - output matrix, column indices of size *p_bnz */ |
| gridinfo_t *grid /* Input - grid of processors information */ |
| ) |
| { |
| int_t i, j, k, col, num_nz, nprocs; |
| int_t *tcolind_recv; /* temporary receive buffer */ |
| int_t *tcolind_send; /* temporary send buffer */ |
| int_t sz_tcolind_send, sz_tcolind_loc, sz_tcolind_recv; |
| int_t ind, ind_tmp, ind_rcv; |
| int redist_pra; /* TRUE if Pr != I or nprocs_i != nprocs_o */ |
| int_t *marker, *iperm_r; |
| int_t *sendCnts, *recvCnts; |
| int_t *sdispls, *rdispls; |
| int_t bnz, *b_rowptr, *b_colind, bnz_t, *b_rowptr_t, *b_colind_t; |
| int_t p, t_ind, nelts, ipcol; |
| int_t m_loc, m_loc_o; /* number of local rows */ |
| int_t fst_row, fst_row_o; /* index of first local row */ |
| int_t nnz_loc; /* number of local nonzeros in matrix A */ |
| float apat_mem, apat_mem_max; |
| int *intBuf1, *intBuf2, *intBuf3, *intBuf4; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter a_plus_at_CompRow_loc()"); |
| #endif |
| fst_row = vtxdist_i[iam]; |
| m_loc = vtxdist_i[iam+1] - vtxdist_i[iam]; |
| nnz_loc = rowptr[m_loc]; |
| redist_pra = FALSE; |
| nprocs = SUPERLU_MAX(nprocs_i, nprocs_o); |
| apat_mem_max = 0.; |
| if (!(marker = (int_t*) SUPERLU_MALLOC( (n+1) * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for marker[]"); |
| if (!(iperm_r = (int_t*) SUPERLU_MALLOC( n * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for iperm_r[]"); |
| if (!(sendCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for sendCnts[]"); |
| if (!(recvCnts = (int_t*) SUPERLU_MALLOC(nprocs * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for recvCnts[]"); |
| if (!(sdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for sdispls[]"); |
| if (!(rdispls = (int_t*) SUPERLU_MALLOC((nprocs+1) * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for rdispls[]"); |
| apat_mem = 2 * n + 4 * nprocs + 3; |
| #if defined (_LONGINT) |
| intBuf1 = (int *) SUPERLU_MALLOC(4 * nprocs * sizeof(int)); |
| intBuf2 = intBuf1 + nprocs; |
| intBuf3 = intBuf1 + 2 * nprocs; |
| intBuf4 = intBuf1 + 3 * nprocs; |
| apat_mem += 4*nprocs*sizeof(int) / sizeof(int_t); |
| #endif |
| /* compute the inverse row permutation vector */ |
| for (i = 0; i < n; i++) { |
| marker[i] = 1; |
| if (perm_r[i] != i) |
| redist_pra = TRUE; |
| iperm_r[perm_r[i]] = i; |
| } |
| /* TRANSPOSE LOCAL ROWS ON MY PROCESSOR iam. */ |
| /* THE RESULT IS STORED IN TCOLIND_SEND. */ |
| /* THIS COUNTS FOR TWO PASSES OF THE LOCAL MATRIX. */ |
| /* First pass to get counts of each row of T, and set up column pointers */ |
| for (j = 0; j < m_loc; j++) { |
| for (i = rowptr[j]; i < rowptr[j+1]; i++){ |
| marker[iperm_r[colind[i]]]++; |
| } |
| } |
| /* determine number of elements to be sent to each processor */ |
| for (p = 0; p < nprocs_i; p++) { |
| sendCnts[p] = 0; |
| for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) |
| sendCnts[p] += marker[i]; |
| } |
| /* exchange send/receive counts information in between all processors */ |
| MPI_Alltoall (sendCnts, 1, mpi_int_t, |
| recvCnts, 1, mpi_int_t, grid->comm); |
| sendCnts[iam] = 0; |
| sz_tcolind_loc = recvCnts[iam]; |
| for (i = 0, j = 0, p = 0; p < nprocs_i; p++) { |
| rdispls[p] = j; |
| j += recvCnts[p]; |
| sdispls[p] = i; |
| i += sendCnts[p]; |
| } |
| recvCnts[iam] = 0; |
| sz_tcolind_recv = j; |
| sz_tcolind_send = i; |
| /* allocate memory to receive necessary blocks of transpose matrix T */ |
| if (sz_tcolind_recv) { |
| if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv |
| * sizeof(int_t) ))) |
| ABORT("SUPERLU_MALLOC fails tcolind_recv[]"); |
| apat_mem += sz_tcolind_recv; |
| } |
| /* allocate memory to send blocks of local transpose matrix T to other processors */ |
| if (sz_tcolind_send) { |
| if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) |
| * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for tcolind_send[]"); |
| apat_mem += sz_tcolind_send; |
| } |
| /* Set up marker[] to point at the beginning of each row in the |
| send/receive buffer. For each row, we store first its number of |
| elements, and then the elements. */ |
| ind_rcv = rdispls[iam]; |
| for (p = 0; p < nprocs_i; p++) { |
| for (i = vtxdist_i[p]; i < vtxdist_i[p+1]; i++) { |
| nelts = marker[i] - 1; |
| if (p == iam) { |
| tcolind_recv[ind_rcv] = nelts; |
| marker[i] = ind_rcv + 1; |
| ind_rcv += nelts + 1; |
| } |
| else { |
| tcolind_send[sdispls[p]] = nelts; |
| marker[i] = sdispls[p] + 1; |
| sdispls[p] += nelts + 1; |
| } |
| } |
| } |
| /* reset sdispls vector */ |
| for (i = 0, p = 0; p < nprocs_i; p++) { |
| sdispls[p] = i; |
| i += sendCnts[p]; |
| } |
| /* Second pass of the local matrix A to copy data to be send */ |
| for (j = 0; j < m_loc; j++) |
| for (i = rowptr[j]; i < rowptr[j+1]; i++) { |
| col = colind[i]; |
| ipcol = iperm_r[col]; |
| if (ipcol >= fst_row && ipcol < fst_row + m_loc) /* local data */ |
| tcolind_recv[marker[ipcol]] = perm_r[j + fst_row]; |
| else /* remote */ |
| tcolind_send[marker[ipcol]] = perm_r[j + fst_row]; |
| marker[ipcol] ++; |
| } |
| sendCnts[iam] = 0; |
| recvCnts[iam] = 0; |
| #if defined (_LONGINT) |
| for (p=0; p<nprocs; p++) { |
| if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX || |
| recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX) |
| ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); |
| intBuf1[p] = (int) sendCnts[p]; |
| intBuf2[p] = (int) sdispls[p]; |
| intBuf3[p] = (int) recvCnts[p]; |
| intBuf4[p] = (int) rdispls[p]; |
| } |
| #else /* Default */ |
| intBuf1 = sendCnts; intBuf2 = sdispls; |
| intBuf3 = recvCnts; intBuf4 = rdispls; |
| #endif |
| /* send/receive transpose matrix T */ |
| MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t, |
| tcolind_recv, intBuf3, intBuf4, mpi_int_t, |
| grid->comm); |
| /* ------------------------------------------------------------ |
| DEALLOCATE SEND COMMUNICATION STORAGE |
| ------------------------------------------------------------*/ |
| if (sz_tcolind_send) { |
| SUPERLU_FREE( tcolind_send ); |
| apat_mem_max = apat_mem; |
| apat_mem -= sz_tcolind_send; |
| } |
| /* ---------------------------------------------------------------- |
| FOR LOCAL ROWS: |
| compute B = A + T, where row j of B is: |
| Struct (B(j,:)) = Struct (A(j,:)) UNION Struct (T(j,:)) |
| do not include the diagonal entry |
| THIS COUNTS FOR TWO PASSES OF THE LOCAL ROWS OF A AND T. |
| ------------------------------------------------------------------ */ |
| /* Reset marker to EMPTY */ |
| for (i = 0; i < n; ++i) marker[i] = EMPTY; |
| /* save rdispls information */ |
| for (p = 0; p < nprocs_i; p++) |
| sdispls[p] = rdispls[p]; |
| /* First pass determines number of nonzeros in B */ |
| num_nz = 0; |
| for (j = 0; j < m_loc; j++) { |
| /* Flag the diagonal so it's not included in the B matrix */ |
| marker[perm_r[j + fst_row]] = j; |
| /* Add pattern of row A(j,:) to B(j,:) */ |
| for (i = rowptr[j]; i < rowptr[j+1]; i++) { |
| k = colind[i]; |
| if ( marker[k] != j ) { |
| marker[k] = j; |
| ++num_nz; |
| } |
| } |
| /* Add pattern of row T(j,:) to B(j,:) */ |
| for (p = 0; p < nprocs_i; p++) { |
| t_ind = rdispls[p]; |
| nelts = tcolind_recv[t_ind]; t_ind ++; |
| for (i = t_ind; i < t_ind + nelts; i++) { |
| k = tcolind_recv[i]; |
| if ( marker[k] != j ) { |
| marker[k] = j; |
| ++num_nz; |
| } |
| } |
| t_ind += nelts; |
| rdispls[p] = t_ind; |
| } |
| } |
| bnz_t = num_nz; |
| /* Allocate storage for B=Pr*A+A'*Pr' */ |
| if ( !(b_rowptr_t = (int_t*) SUPERLU_MALLOC((m_loc+1) * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for b_rowptr_t[]"); |
| if ( bnz_t ) { |
| if ( !(b_colind_t = (int_t*) SUPERLU_MALLOC( bnz_t * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for b_colind_t[]"); |
| } |
| apat_mem += m_loc + 1 + bnz_t; |
| if (apat_mem > apat_mem_max) |
| apat_mem_max = apat_mem; |
| /* Reset marker to EMPTY */ |
| for (i = 0; i < n; i++) marker[i] = EMPTY; |
| /* restore rdispls information */ |
| for (p = 0; p < nprocs_i; p++) |
| rdispls[p] = sdispls[p]; |
| /* Second pass, compute each row of B, one at a time */ |
| num_nz = 0; |
| t_ind = 0; |
| for (j = 0; j < m_loc; j++) { |
| b_rowptr_t[j] = num_nz; |
| /* Flag the diagonal so it's not included in the B matrix */ |
| marker[perm_r[j + fst_row]] = j; |
| /* Add pattern of row A(j,:) to B(j,:) */ |
| for (i = rowptr[j]; i < rowptr[j+1]; i++) { |
| k = colind[i]; |
| if ( marker[k] != j ) { |
| marker[k] = j; |
| b_colind_t[num_nz] = k; num_nz ++; |
| } |
| } |
| /* Add pattern of row T(j,:) to B(j,:) */ |
| for (p = 0; p < nprocs_i; p++) { |
| t_ind = rdispls[p]; |
| nelts = tcolind_recv[t_ind]; t_ind++; |
| for (i = t_ind; i < t_ind + nelts; i++) { |
| k = tcolind_recv[i]; |
| if ( marker[k] != j ) { |
| marker[k] = j; |
| b_colind_t[num_nz] = k; num_nz++; |
| } |
| } |
| t_ind += nelts; |
| rdispls[p] = t_ind; |
| } |
| } |
| b_rowptr_t[m_loc] = num_nz; |
| for (p = 0; p <= SUPERLU_MIN(nprocs_i, nprocs_o); p++) |
| if (vtxdist_i[p] != vtxdist_o[p]) |
| redist_pra = TRUE; |
| if (sz_tcolind_recv) { |
| SUPERLU_FREE (tcolind_recv); |
| apat_mem -= sz_tcolind_recv; |
| } |
| SUPERLU_FREE (marker); |
| SUPERLU_FREE (iperm_r); |
| apat_mem -= 2 * n + 1; |
| /* redistribute permuted matrix (by rows) from nproc_i processors |
| to nproc_o processors */ |
| if (redist_pra) { |
| m_loc_o = vtxdist_o[iam+1] - vtxdist_o[iam]; |
| fst_row_o = vtxdist_o[iam]; |
| nnz_loc = 0; |
| if ( !(b_rowptr = intMalloc_dist(m_loc_o + 1)) ) |
| ABORT("Malloc fails for *b_rowptr[]."); |
| apat_mem += m_loc_o + 1; |
| if (apat_mem > apat_mem_max) |
| apat_mem_max = apat_mem; |
| for (p = 0; p < nprocs_i; p++) { |
| sendCnts[p] = 0; |
| recvCnts[p] = 0; |
| } |
| for (i = 0; i < m_loc; i++) { |
| k = perm_r[i+fst_row]; |
| /* find the processor to which row k belongs */ |
| j = FALSE; p = 0; |
| while (!j) { |
| if (vtxdist_o[p] <= k && k < vtxdist_o[p+1]) |
| j = TRUE; |
| else |
| p ++; |
| } |
| if (p == iam) { |
| b_rowptr[k-fst_row_o] = b_rowptr_t[i + 1] - b_rowptr_t[i]; |
| nnz_loc += b_rowptr[k-fst_row_o]; |
| } |
| else |
| sendCnts[p] += b_rowptr_t[i + 1] - b_rowptr_t[i] + 2; |
| } |
| /* exchange send/receive counts information in between all processors */ |
| MPI_Alltoall (sendCnts, 1, mpi_int_t, |
| recvCnts, 1, mpi_int_t, grid->comm); |
| for (i = 0, j = 0, p = 0; p < nprocs_i; p++) { |
| rdispls[p] = j; |
| j += recvCnts[p]; |
| sdispls[p] = i; |
| i += sendCnts[p]; |
| } |
| rdispls[p] = j; |
| sdispls[p] = i; |
| sz_tcolind_recv = j; |
| sz_tcolind_send = i; |
| /* allocate memory for local data */ |
| tcolind_recv = NULL; |
| tcolind_send = NULL; |
| if (sz_tcolind_recv) { |
| if ( !(tcolind_recv = (int_t*) SUPERLU_MALLOC( sz_tcolind_recv |
| * sizeof(int_t) ))) |
| ABORT("SUPERLU_MALLOC fails tcolind_recv[]"); |
| apat_mem += sz_tcolind_recv; |
| } |
| /* allocate memory to receive necessary data */ |
| if (sz_tcolind_send) { |
| if (!(tcolind_send = (int_t*) SUPERLU_MALLOC( (sz_tcolind_send) |
| * sizeof(int_t)))) |
| ABORT("SUPERLU_MALLOC fails for tcolind_send[]"); |
| apat_mem += sz_tcolind_send; |
| } |
| if (apat_mem > apat_mem_max) |
| apat_mem_max = apat_mem; |
| /* Copy data to be send */ |
| ind_rcv = rdispls[iam]; |
| for (i = 0; i < m_loc; i++) { |
| k = perm_r[i+fst_row]; |
| /* find the processor to which row k belongs */ |
| j = FALSE; p = 0; |
| while (!j) { |
| if (vtxdist_o[p] <= k && k < vtxdist_o[p+1]) |
| j = TRUE; |
| else |
| p ++; |
| } |
| if (p != iam) { /* remote */ |
| tcolind_send[sdispls[p]] = k; |
| tcolind_send[sdispls[p]+1] = b_rowptr_t[i+1] - b_rowptr_t[i]; |
| sdispls[p] += 2; |
| for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++) { |
| tcolind_send[sdispls[p]] = b_colind_t[j]; sdispls[p] ++; |
| } |
| } |
| } |
| /* reset sdispls vector */ |
| for (i = 0, p = 0; p < nprocs_i; p++) { |
| sdispls[p] = i; |
| i += sendCnts[p]; |
| } |
| sendCnts[iam] = 0; |
| recvCnts[iam] = 0; |
| #if defined (_LONGINT) |
| for (p=0; p<nprocs; p++) { |
| if (sendCnts[p] > INT_MAX || sdispls[p] > INT_MAX || |
| recvCnts[p] > INT_MAX || rdispls[p] > INT_MAX) |
| ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); |
| intBuf1[p] = (int) sendCnts[p]; |
| intBuf2[p] = (int) sdispls[p]; |
| intBuf3[p] = (int) recvCnts[p]; |
| intBuf4[p] = (int) rdispls[p]; |
| } |
| #else /* Default */ |
| intBuf1 = sendCnts; intBuf2 = sdispls; |
| intBuf3 = recvCnts; intBuf4 = rdispls; |
| #endif |
| /* send/receive permuted matrix T by rows */ |
| MPI_Alltoallv (tcolind_send, intBuf1, intBuf2, mpi_int_t, |
| tcolind_recv, intBuf3, intBuf4, mpi_int_t, |
| grid->comm); |
| /* ------------------------------------------------------------ |
| DEALLOCATE COMMUNICATION STORAGE |
| ------------------------------------------------------------*/ |
| if (sz_tcolind_send) { |
| SUPERLU_FREE( tcolind_send ); |
| apat_mem -= sz_tcolind_send; |
| } |
| /* ------------------------------------------------------------ |
| STORE ROWS IN ASCENDING ORDER OF THEIR NUMBER |
| ------------------------------------------------------------*/ |
| for (p = 0; p < nprocs; p++) { |
| if (p != iam) { |
| i = rdispls[p]; |
| while (i < rdispls[p+1]) { |
| j = tcolind_recv[i]; |
| nelts = tcolind_recv[i+1]; |
| i += 2 + nelts; |
| b_rowptr[j-fst_row_o] = nelts; |
| nnz_loc += nelts; |
| } |
| } |
| } |
| if (nnz_loc) |
| if ( !(b_colind = intMalloc_dist(nnz_loc)) ) { |
| ABORT("Malloc fails for bcolind[]."); |
| apat_mem += nnz_loc; |
| if (apat_mem > apat_mem_max) |
| apat_mem_max = apat_mem; |
| } |
| /* Initialize the array of row pointers */ |
| k = 0; |
| for (j = 0; j < m_loc_o; j++) { |
| i = b_rowptr[j]; |
| b_rowptr[j] = k; |
| k += i; |
| } |
| if (m_loc_o) b_rowptr[j] = k; |
| /* Copy the data into the row oriented storage */ |
| for (p = 0; p < nprocs; p++) { |
| if (p != iam) { |
| i = rdispls[p]; |
| while (i < rdispls[p+1]) { |
| j = tcolind_recv[i]; |
| nelts = tcolind_recv[i+1]; |
| for (i += 2, k = b_rowptr[j-fst_row_o]; |
| k < b_rowptr[j-fst_row_o+1]; i++, k++) |
| b_colind[k] = tcolind_recv[i]; |
| } |
| } |
| } |
| for (i = 0; i < m_loc; i++) { |
| k = perm_r[i+fst_row]; |
| if (k >= vtxdist_o[iam] && k < vtxdist_o[iam+1]) { |
| ind = b_rowptr[k-fst_row_o]; |
| for (j = b_rowptr_t[i]; j < b_rowptr_t[i+1]; j++, ind++) |
| b_colind[ind] = b_colind_t[j]; |
| } |
| } |
| SUPERLU_FREE(b_rowptr_t); |
| if ( bnz_t ) |
| SUPERLU_FREE(b_colind_t); |
| if (sz_tcolind_recv) |
| SUPERLU_FREE(tcolind_recv); |
| apat_mem -= bnz_t + m_loc + sz_tcolind_recv; |
| *p_bnz = nnz_loc; |
| *p_b_rowptr = b_rowptr; |
| *p_b_colind = b_colind; |
| } |
| else { |
| *p_bnz = bnz_t; |
| *p_b_rowptr = b_rowptr_t; |
| *p_b_colind = b_colind_t; |
| } |
| SUPERLU_FREE (rdispls); |
| SUPERLU_FREE (sdispls); |
| SUPERLU_FREE (sendCnts); |
| SUPERLU_FREE (recvCnts); |
| apat_mem -= 4 * nprocs + 2; |
| #if defined (_LONGINT) |
| SUPERLU_FREE (intBuf1); |
| apat_mem -= 4*nprocs*sizeof(int) / sizeof(int_t); |
| #endif |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit a_plus_at_CompRow_loc()"); |
| #endif |
| return (- apat_mem_max * sizeof(int_t)); |
| } /* a_plus_at_CompRow_loc */ |
| Property changes: |
| Added: svn:executable |
| + * |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/pdgsequ.c |
|---|
| New file |
| 0,0 → 1,235 |
| /*! @file |
| * \brief Computes row and column scalings |
| * |
| * File name: pdgsequ.c |
| * History: Modified from LAPACK routine DGEEQU |
| */ |
| #include <math.h> |
| #include "superlu_ddefs.h" |
| /*! \brief |
| <pre> |
| Purpose |
| ======= |
| PDGSEQU computes row and column scalings intended to equilibrate an |
| M-by-N sparse matrix A and reduce its condition number. R returns the row |
| scale factors and C the column scale factors, chosen to try to make |
| the largest element in each row and column of the matrix B with |
| elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. |
| R(i) and C(j) are restricted to be between SMLNUM = smallest safe |
| number and BIGNUM = largest safe number. Use of these scaling |
| factors is not guaranteed to reduce the condition number of A but |
| works well in practice. |
| See supermatrix.h for the definition of 'SuperMatrix' structure. |
| Arguments |
| ========= |
| A (input) SuperMatrix* |
| The matrix of dimension (A->nrow, A->ncol) whose equilibration |
| factors are to be computed. The type of A can be: |
| Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. |
| R (output) double*, size A->nrow |
| If INFO = 0 or INFO > M, R contains the row scale factors |
| for A. |
| C (output) double*, size A->ncol |
| If INFO = 0, C contains the column scale factors for A. |
| ROWCND (output) double* |
| If INFO = 0 or INFO > M, ROWCND contains the ratio of the |
| smallest R(i) to the largest R(i). If ROWCND >= 0.1 and |
| AMAX is neither too large nor too small, it is not worth |
| scaling by R. |
| COLCND (output) double* |
| If INFO = 0, COLCND contains the ratio of the smallest |
| C(i) to the largest C(i). If COLCND >= 0.1, it is not |
| worth scaling by C. |
| AMAX (output) double* |
| Absolute value of largest matrix element. If AMAX is very |
| close to overflow or very close to underflow, the matrix |
| should be scaled. |
| INFO (output) int* |
| = 0: successful exit |
| < 0: if INFO = -i, the i-th argument had an illegal value |
| > 0: if INFO = i, and i is |
| <= M: the i-th row of A is exactly zero |
| > M: the (i-M)-th column of A is exactly zero |
| GRID (input) gridinof_t* |
| The 2D process mesh. |
| ===================================================================== |
| </pre> |
| */ |
| void |
| pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, |
| double *colcnd, double *amax, int_t *info, gridinfo_t *grid) |
| { |
| /* Local variables */ |
| NRformat_loc *Astore; |
| double *Aval; |
| int i, j, irow, jcol, m_loc; |
| double rcmin, rcmax; |
| double bignum, smlnum; |
| extern double dlamch_(char *); |
| double tempmax, tempmin; |
| double *loc_max; |
| int *r_sizes, *displs; |
| double *loc_r; |
| int_t procs; |
| /* Test the input parameters. */ |
| *info = 0; |
| if ( A->nrow < 0 || A->ncol < 0 || |
| A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) |
| *info = -1; |
| if (*info != 0) { |
| i = -(*info); |
| xerbla_("pdgsequ", &i); |
| return; |
| } |
| /* Quick return if possible */ |
| if ( A->nrow == 0 || A->ncol == 0 ) { |
| *rowcnd = 1.; |
| *colcnd = 1.; |
| *amax = 0.; |
| return; |
| } |
| Astore = A->Store; |
| Aval = Astore->nzval; |
| m_loc = Astore->m_loc; |
| /* Get machine constants. */ |
| smlnum = dlamch_("S"); |
| bignum = 1. / smlnum; |
| /* Compute row scale factors. */ |
| for (i = 0; i < A->nrow; ++i) r[i] = 0.; |
| /* Find the maximum element in each row. */ |
| irow = Astore->fst_row; |
| for (i = 0; i < m_loc; ++i) { |
| for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) |
| r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); |
| ++irow; |
| } |
| /* Find the maximum and minimum scale factors. */ |
| rcmin = bignum; |
| rcmax = 0.; |
| for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { |
| rcmax = SUPERLU_MAX(rcmax, r[i]); |
| rcmin = SUPERLU_MIN(rcmin, r[i]); |
| } |
| /* Get the global MAX and MIN for R */ |
| tempmax = rcmax; |
| tempmin = rcmin; |
| MPI_Allreduce( &tempmax, &rcmax, |
| 1, MPI_DOUBLE, MPI_MAX, grid->comm); |
| MPI_Allreduce( &tempmin, &rcmin, |
| 1, MPI_DOUBLE, MPI_MIN, grid->comm); |
| *amax = rcmax; |
| if (rcmin == 0.) { |
| /* Find the first zero scale factor and return an error code. */ |
| for (i = 0; i < A->nrow; ++i) |
| if (r[i] == 0.) { |
| *info = i + 1; |
| return; |
| } |
| } else { |
| /* Invert the scale factors. */ |
| for (i = 0; i < A->nrow; ++i) |
| r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); |
| /* Compute ROWCND = min(R(I)) / max(R(I)) */ |
| *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); |
| } |
| /* Compute column scale factors */ |
| for (j = 0; j < A->ncol; ++j) c[j] = 0.; |
| /* Find the maximum element in each column, assuming the row |
| scalings computed above. */ |
| irow = Astore->fst_row; |
| for (i = 0; i < m_loc; ++i) { |
| for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { |
| jcol = Astore->colind[j]; |
| c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); |
| } |
| ++irow; |
| } |
| /* Find the global maximum for c[j] */ |
| if ( !(loc_max = doubleMalloc_dist(A->ncol))) |
| ABORT("Malloc fails for loc_max[]."); |
| for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; |
| MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm); |
| SUPERLU_FREE(loc_max); |
| /* Find the maximum and minimum scale factors. */ |
| rcmin = bignum; |
| rcmax = 0.; |
| for (j = 0; j < A->ncol; ++j) { |
| rcmax = SUPERLU_MAX(rcmax, c[j]); |
| rcmin = SUPERLU_MIN(rcmin, c[j]); |
| } |
| if (rcmin == 0.) { |
| /* Find the first zero scale factor and return an error code. */ |
| for (j = 0; j < A->ncol; ++j) |
| if ( c[j] == 0. ) { |
| *info = A->nrow + j + 1; |
| return; |
| } |
| } else { |
| /* Invert the scale factors. */ |
| for (j = 0; j < A->ncol; ++j) |
| c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); |
| /* Compute COLCND = min(C(J)) / max(C(J)) */ |
| *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); |
| } |
| /* gather R from each process to get the global R. */ |
| procs = grid->nprow * grid->npcol; |
| if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) |
| ABORT("Malloc fails for r_sizes[]."); |
| displs = r_sizes + procs; |
| if ( !(loc_r = doubleMalloc_dist(m_loc))) |
| ABORT("Malloc fails for loc_r[]."); |
| j = Astore->fst_row; |
| for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; |
| /* First gather the size of each piece. */ |
| MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm); |
| /* Set up the displacements for allgatherv */ |
| displs[0] = 0; |
| for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; |
| /* Now gather the actual data */ |
| MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs, |
| MPI_DOUBLE, grid->comm); |
| SUPERLU_FREE(r_sizes); |
| SUPERLU_FREE(loc_r); |
| return; |
| } /* pdgsequ */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/pdsymbfact_distdata.c |
|---|
| New file |
| 0,0 → 1,1961 |
| /*! @file |
| * \brief Redistribute the symbolic structure of L and U from the distribution |
| * |
| * <pre> |
| * -- Parallel symbolic factorization auxialiary routine (version 2.3) -- |
| * -- Distributes the data from parallel symbolic factorization |
| * -- to numeric factorization |
| * INRIA France - July 1, 2004 |
| * Laura Grigori |
| * |
| * November 1, 2007 |
| * Feburary 20, 2008 |
| * October 15, 2008 |
| * </pre> |
| */ |
| /* limits.h: the largest positive integer (INT_MAX) */ |
| #include <limits.h> |
| #include "superlu_ddefs.h" |
| #include "psymbfact.h" |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * |
| * Redistribute the symbolic structure of L and U from the distribution |
| * used in the parallel symbolic factorization step to the distdibution |
| * used in the parallel numeric factorization step. On exit, the L and U |
| * structure for the 2D distribution used in the numeric factorization step is |
| * stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal |
| * information is also computed and it is stored in Glu_persist->supno |
| * and Glu_persist->xsup. |
| * |
| * This routine allocates memory for storing the structure of L and U |
| * and the supernodes information. This represents the arrays: |
| * p_xlsub, p_lsub, p_xusub, p_usub, |
| * Glu_persist->supno, Glu_persist->xsup. |
| * |
| * This routine also deallocates memory allocated during symbolic |
| * factorization routine. That is, the folloing arrays are freed: |
| * Pslu_freeable->xlsub, Pslu_freeable->lsub, |
| * Pslu_freeable->xusub, Pslu_freeable->usub, |
| * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc, |
| * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc. |
| * |
| * Arguments |
| * ========= |
| * |
| * n (Input) int_t |
| * Order of the input matrix |
| * Pslu_freeable (Input) Pslu_freeable_t * |
| * Local L and U structure, |
| * global to local indexing information. |
| * |
| * Glu_persist (Output) Glu_persist_t * |
| * Stores on output the information on supernodes mapping. |
| * |
| * p_xlsub (Output) int_t ** |
| * Pointer to structure of L distributed on a 2D grid |
| * of processors, stored by columns. |
| * |
| * p_lsub (Output) int_t ** |
| * Structure of L distributed on a 2D grid of processors, |
| * stored by columns. |
| * |
| * p_xusub (Output) int_t ** |
| * Pointer to structure of U distributed on a 2D grid |
| * of processors, stored by rows. |
| * |
| * p_usub (Output) int_t ** |
| * Structure of U distributed on a 2D grid of processors, |
| * stored by rows. |
| * |
| * grid (Input) gridinfo_t* |
| * The 2D process mesh. |
| * |
| * Return value |
| * ============ |
| * < 0, number of bytes allocated on return from the dist_symbLU. |
| * > 0, number of bytes allocated in this routine when out of memory. |
| * (an approximation). |
| * </pre> |
| */ |
| static float |
| dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, |
| Glu_persist_t *Glu_persist, |
| int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, |
| gridinfo_t *grid |
| ) |
| { |
| int iam, nprocs, pc, pr, p, np, p_diag; |
| int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, |
| *tmp_ptrToSend, *mem; |
| int_t *nnzToRecv_l, *nnzToRecv_u; |
| int_t *send_1, *send_2, nsend_1, nsend_2; |
| int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; |
| int_t nsupers, nsupers_i, nsupers_j; |
| int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; |
| int_t maxszsn, maxNvtcsPProc; |
| int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; |
| int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; |
| int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; |
| int_t *xsub_s, *sub_s, *xsub_n, *sub_n; |
| int_t *globToLoc, nvtcs_loc; |
| int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, |
| RecvCnt_l, RecvCnt_u, ind_loc; |
| int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; |
| int_t nelts, isize; |
| float memAux; /* Memory used during this routine and freed on return */ |
| float memRet; /* Memory allocated and not freed on return */ |
| int_t iword, dword; |
| /* ------------------------------------------------------------ |
| INITIALIZATION. |
| ------------------------------------------------------------*/ |
| iam = grid->iam; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter dist_symbLU()"); |
| #endif |
| nprocs = (int) grid->nprow * grid->npcol; |
| xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; |
| xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; |
| maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; |
| globToLoc = Pslu_freeable->globToLoc; |
| nvtcs_loc = Pslu_freeable->nvtcs_loc; |
| xsup_beg_s = Pslu_freeable->xsup_beg_loc; |
| xsup_end_s = Pslu_freeable->xsup_end_loc; |
| supno_s = Pslu_freeable->supno_loc; |
| rcv_luind = NULL; |
| iword = sizeof(int_t); |
| dword = sizeof(double); |
| memAux = 0.; memRet = 0.; |
| mem = intCalloc_dist(12 * nprocs); |
| if (!mem) |
| return (ERROR_RET); |
| memAux = (float) (12 * nprocs * sizeof(int_t)); |
| nnzToRecv = mem; |
| nnzToSend = nnzToRecv + 2*nprocs; |
| nnzToSend_l = nnzToSend + 2 * nprocs; |
| nnzToSend_u = nnzToSend_l + nprocs; |
| send_1 = nnzToSend_u + nprocs; |
| send_2 = send_1 + nprocs; |
| tmp_ptrToSend = send_2 + nprocs; |
| nnzToRecv_l = tmp_ptrToSend + nprocs; |
| nnzToRecv_u = nnzToRecv_l + nprocs; |
| ptrToSend = nnzToSend; |
| ptrToRecv = nnzToSend + nprocs; |
| nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); |
| intBuf1 = nvtcs + nprocs; |
| intBuf2 = nvtcs + 2 * nprocs; |
| intBuf3 = nvtcs + 3 * nprocs; |
| intBuf4 = nvtcs + 4 * nprocs; |
| memAux += 5 * nprocs * sizeof(int); |
| maxszsn = sp_ienv_dist(3); |
| /* Allocate space for storing Glu_persist_n. */ |
| if ( !(supno_n = intMalloc_dist(n+1)) ) { |
| fprintf (stderr, "Malloc fails for supno_n[]."); |
| return (memAux); |
| } |
| memRet += (float) ((n+1) * sizeof(int_t)); |
| /* ------------------------------------------------------------ |
| DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION |
| ------------------------------------------------------------*/ |
| if (nvtcs_loc > INT_MAX) |
| ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); |
| intNvtcs_loc = (int) nvtcs_loc; |
| MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, |
| 0, grid->comm); |
| if (!iam) { |
| /* set ptrToRecv to point to the beginning of the data for |
| each processor */ |
| for (k = 0, p = 0; p < nprocs; p++) { |
| ptrToRecv[p] = k; |
| k += nvtcs[p]; |
| } |
| } |
| if (nprocs > 1) { |
| temp = NULL; |
| if (!iam ) { |
| if ( !(temp = intMalloc_dist (n+1)) ) { |
| fprintf (stderr, "Malloc fails for temp[]."); |
| return (memAux + memRet); |
| } |
| memAux += (float) (n+1) * iword; |
| } |
| #if defined (_LONGINT) |
| for (p=0; p<nprocs; p++) { |
| if (ptrToRecv[p] > INT_MAX) |
| ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); |
| intBuf1[p] = (int) ptrToRecv[p]; |
| } |
| #else /* Default */ |
| intBuf1 = ptrToRecv; |
| #endif |
| MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, |
| temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); |
| } |
| else |
| temp = supno_s; |
| if (!iam) { |
| nsupers = 0; |
| p = (int) OWNER( globToLoc[0] ); |
| gb = temp[ptrToRecv[p]]; |
| supno_n[0] = nsupers; |
| ptrToRecv[p] ++; |
| szsn = 1; |
| for (j = 1; j < n; j ++) { |
| if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { |
| nsupers ++; |
| p = (int) OWNER( globToLoc[j] ); |
| gb = temp[ptrToRecv[p]]; |
| szsn = 1; |
| } |
| else { |
| szsn ++; |
| } |
| ptrToRecv[p] ++; |
| supno_n[j] = nsupers; |
| } |
| nsupers++; |
| if (nprocs > 1) { |
| SUPERLU_FREE (temp); |
| memAux -= (float) (n+1) * iword; |
| } |
| supno_n[n] = nsupers; |
| } |
| /* reset to 0 nnzToSend */ |
| for (p = 0; p < 2 *nprocs; p++) |
| nnzToSend[p] = 0; |
| MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); |
| nsupers = supno_n[n]; |
| /* Allocate space for storing Glu_persist_n. */ |
| if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { |
| fprintf (stderr, "Malloc fails for xsup_n[]."); |
| return (memAux + memRet); |
| } |
| memRet += (float) (nsupers+1) * iword; |
| /* ------------------------------------------------------------ |
| COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, |
| THEN ALLOCATE SPACE. |
| THIS ACCOUNTS FOR THE FIRST PASS OF L and U. |
| ------------------------------------------------------------*/ |
| gb = EMPTY; |
| for (i = 0; i < n; i++) { |
| if (gb != supno_n[i]) { |
| /* a new supernode starts */ |
| gb = supno_n[i]; |
| xsup_n[gb] = i; |
| } |
| } |
| xsup_n[nsupers] = n; |
| for (p = 0; p < nprocs; p++) { |
| send_1[p] = FALSE; |
| send_2[p] = FALSE; |
| } |
| for (gb_n = 0; gb_n < nsupers; gb_n ++) { |
| i = xsup_n[gb_n]; |
| if (iam == (int) OWNER( globToLoc[i] )) { |
| pc = PCOL( gb_n, grid ); |
| pr = PROW( gb_n, grid ); |
| p_diag = PNUM( pr, pc, grid); |
| i_loc = LOCAL_IND( globToLoc[i] ); |
| gb_s = supno_s[i_loc]; |
| fst_s = xsup_beg_s[gb_s]; |
| lst_s = xsup_end_s[gb_s]; |
| fst_s_l = LOCAL_IND( globToLoc[fst_s] ); |
| for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { |
| k = lsub_s[j]; |
| if (k >= i) { |
| gb = supno_n[k]; |
| p = (int) PNUM( PROW(gb, grid), pc, grid ); |
| nnzToSend[2*p] ++; |
| send_1[p] = TRUE; |
| } |
| } |
| for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { |
| k = usub_s[j]; |
| if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { |
| gb = supno_n[k]; |
| p = PNUM( pr, PCOL(gb, grid), grid); |
| nnzToSend[2*p+1] ++; |
| send_2[p] = TRUE; |
| } |
| } |
| nsend_2 = 0; |
| for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { |
| nnzToSend[2*p+1] += 2; |
| if (send_2[p]) nsend_2 ++; |
| } |
| for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) |
| if (send_2[p] || p == p_diag) { |
| if (p == p_diag && !send_2[p]) |
| nnzToSend[2*p+1] += nsend_2; |
| else |
| nnzToSend[2*p+1] += nsend_2-1; |
| send_2[p] = FALSE; |
| } |
| nsend_1 = 0; |
| for (p = pc; p < nprocs; p += grid->npcol) { |
| nnzToSend[2*p] += 2; |
| if (send_1[p]) nsend_1 ++; |
| } |
| for (p = pc; p < nprocs; p += grid->npcol) |
| if (send_1[p]) { |
| nnzToSend[2*p] += nsend_1-1; |
| send_1[p] = FALSE; |
| } |
| else |
| nnzToSend[2*p] += nsend_1; |
| } |
| } |
| /* All-to-all communication */ |
| MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, |
| grid->comm); |
| nnz_loc_l = nnz_loc_u = 0; |
| SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; |
| for (p = 0; p < nprocs; p++) { |
| if ( p != iam ) { |
| SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; |
| SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; |
| RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; |
| RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; |
| } else { |
| nnz_loc_l += nnzToRecv[2*p]; |
| nnz_loc_u += nnzToRecv[2*p+1]; |
| nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; |
| nnzToRecv_l[p] = nnzToRecv[2*p]; |
| nnzToRecv_u[p] = nnzToRecv[2*p+1]; |
| } |
| } |
| /* Allocate space for storing the symbolic structure after redistribution. */ |
| nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ |
| nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ |
| if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { |
| fprintf (stderr, "Malloc fails for xlsub_n[]."); |
| return (memAux + memRet); |
| } |
| memRet += (float) (nsupers_j+1) * iword; |
| if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { |
| fprintf (stderr, "Malloc fails for xusub_n[]."); |
| return (memAux + memRet); |
| } |
| memRet += (float) (nsupers_i+1) * iword; |
| /* Allocate temp storage for sending/receiving the L/U symbolic structure. */ |
| if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { |
| if (!(rcv_luind = |
| intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { |
| fprintf (stderr, "Malloc fails for rcv_luind[]."); |
| return (memAux + memRet); |
| } |
| memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) |
| * iword; |
| } |
| if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { |
| if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { |
| fprintf (stderr, "Malloc fails for index[]."); |
| return (memAux + memRet); |
| } |
| memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; |
| } |
| /* ------------------------------------------------------------------ |
| LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. |
| THIS ACCOUNTS FOR THE SECOND PASS OF L and U. |
| ------------------------------------------------------------------*/ |
| sendL = TRUE; |
| sendU = FALSE; |
| while (sendL || sendU) { |
| if (sendL) { |
| xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; |
| nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; |
| } |
| if (sendU) { |
| xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; |
| nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; |
| } |
| for (i = 0, j = 0, p = 0; p < nprocs; p++) { |
| if ( p != iam ) { |
| ptrToSend[p] = i; i += nnzToSend[p]; |
| } |
| ptrToRecv[p] = j; j += nnzToRecv[p]; |
| } |
| nnzToRecv[iam] = 0; |
| ind_loc = ptrToRecv[iam]; |
| for (gb_n = 0; gb_n < nsupers; gb_n++) { |
| nsend_2 = 0; |
| i = xsup_n[gb_n]; |
| if (iam == OWNER( globToLoc[i] )) { |
| pc = PCOL( gb_n, grid ); |
| pr = PROW( gb_n, grid ); |
| p_diag = PNUM( pr, pc, grid ); |
| i_loc = LOCAL_IND( globToLoc[i] ); |
| gb_s = supno_s[i_loc]; |
| fst_s = xsup_beg_s[gb_s]; |
| lst_s = xsup_end_s[gb_s]; |
| fst_s_l = LOCAL_IND( globToLoc[fst_s] ); |
| if (sendL) { |
| p = pc; np = grid->nprow; |
| } else { |
| p = pr * grid->npcol; np = grid->npcol; |
| } |
| for (j = 0; j < np; j++) { |
| if (p == iam) { |
| rcv_luind[ind_loc] = gb_n; |
| rcv_luind[ind_loc+1] = 0; |
| tmp_ptrToSend[p] = ind_loc + 1; |
| ind_loc += 2; |
| } |
| else { |
| snd_luind[ptrToSend[p]] = gb_n; |
| snd_luind[ptrToSend[p]+1] = 0; |
| tmp_ptrToSend[p] = ptrToSend[p] + 1; |
| ptrToSend[p] += 2; |
| } |
| if (sendL) p += grid->npcol; |
| if (sendU) p++; |
| } |
| for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { |
| k = sub_s[j]; |
| if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { |
| gb = supno_n[k]; |
| if (sendL) |
| p = PNUM( PROW(gb, grid), pc, grid ); |
| else |
| p = PNUM( pr, PCOL(gb, grid), grid); |
| if (send_1[p] == FALSE) { |
| send_1[p] = TRUE; |
| send_2[nsend_2] = k; nsend_2 ++; |
| } |
| if (p == iam) { |
| rcv_luind[ind_loc] = k; ind_loc++; |
| if (sendL) |
| xsub_n[LBj( gb_n, grid )] ++; |
| else |
| xsub_n[LBi( gb_n, grid )] ++; |
| } |
| else { |
| snd_luind[ptrToSend[p]] = k; |
| ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; |
| } |
| } |
| } |
| if (sendL) |
| for (p = pc; p < nprocs; p += grid->npcol) { |
| for (k = 0; k < nsend_2; k++) { |
| gb = supno_n[send_2[k]]; |
| if (PNUM(PROW(gb, grid), pc, grid) != p) { |
| if (p == iam) { |
| rcv_luind[ind_loc] = send_2[k]; ind_loc++; |
| xsub_n[LBj( gb_n, grid )] ++; |
| } |
| else { |
| snd_luind[ptrToSend[p]] = send_2[k]; |
| ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; |
| } |
| } |
| } |
| send_1[p] = FALSE; |
| } |
| if (sendU) |
| for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { |
| if (send_1[p] || p == p_diag) { |
| for (k = 0; k < nsend_2; k++) { |
| gb = supno_n[send_2[k]]; |
| if(PNUM( pr, PCOL(gb, grid), grid) != p) { |
| if (p == iam) { |
| rcv_luind[ind_loc] = send_2[k]; ind_loc++; |
| xsub_n[LBi( gb_n, grid )] ++; |
| } |
| else { |
| snd_luind[ptrToSend[p]] = send_2[k]; |
| ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; |
| } |
| } |
| } |
| send_1[p] = FALSE; |
| } |
| } |
| } |
| } |
| /* reset ptrToSnd to point to the beginning of the data for |
| each processor (structure needed in MPI_Alltoallv) */ |
| for (i = 0, p = 0; p < nprocs; p++) { |
| ptrToSend[p] = i; i += nnzToSend[p]; |
| } |
| /* ------------------------------------------------------------ |
| PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. |
| Note: it uses MPI_Alltoallv. |
| ------------------------------------------------------------*/ |
| if (nprocs > 1) { |
| #if defined (_LONGINT) |
| nnzToSend[iam] = 0; |
| for (p=0; p<nprocs; p++) { |
| if (nnzToSend[p] > INT_MAX || ptrToSend[p] > INT_MAX || |
| nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) |
| ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); |
| intBuf1[p] = (int) nnzToSend[p]; |
| intBuf2[p] = (int) ptrToSend[p]; |
| intBuf3[p] = (int) nnzToRecv[p]; |
| intBuf4[p] = (int) ptrToRecv[p]; |
| } |
| #else /* Default */ |
| intBuf1 = nnzToSend; intBuf2 = ptrToSend; |
| intBuf3 = nnzToRecv; intBuf4 = ptrToRecv; |
| #endif |
| MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, |
| rcv_luind, intBuf3, intBuf4, mpi_int_t, |
| grid->comm); |
| } |
| if (sendL) |
| nnzToRecv[iam] = nnz_loc_l; |
| else |
| nnzToRecv[iam] = nnz_loc_u; |
| /* ------------------------------------------------------------ |
| DEALLOCATE TEMPORARY STORAGE. |
| -------------------------------------------------------------*/ |
| if (sendU) |
| if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { |
| SUPERLU_FREE (snd_luind); |
| memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; |
| } |
| /* ------------------------------------------------------------ |
| CONVERT THE FORMAT. |
| ------------------------------------------------------------*/ |
| /* Initialize the array of column of L/ row of U pointers */ |
| k = 0; |
| for (p = 0; p < nprocs; p ++) { |
| if (p != iam) { |
| i = k; |
| while (i < k + nnzToRecv[p]) { |
| gb = rcv_luind[i]; |
| nelts = rcv_luind[i+1]; |
| if (sendL) |
| xsub_n[LBj( gb, grid )] = nelts; |
| else |
| xsub_n[LBi( gb, grid )] = nelts; |
| i += nelts + 2; |
| } |
| } |
| k += nnzToRecv[p]; |
| } |
| if (sendL) j = nsupers_j; |
| else j = nsupers_i; |
| k = 0; |
| isize = xsub_n[0]; |
| xsub_n[0] = 0; |
| for (gb_l = 1; gb_l < j; gb_l++) { |
| k += isize; |
| isize = xsub_n[gb_l]; |
| xsub_n[gb_l] = k; |
| } |
| xsub_n[gb_l] = k + isize; |
| nnz_loc = xsub_n[gb_l]; |
| if (sendL) { |
| lsub_n = NULL; |
| if (nnz_loc) { |
| if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { |
| fprintf (stderr, "Malloc fails for lsub_n[]."); |
| return (memAux + memRet); |
| } |
| memRet += (float) (nnz_loc * iword); |
| } |
| sub_n = lsub_n; |
| } |
| if (sendU) { |
| usub_n = NULL; |
| if (nnz_loc) { |
| if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { |
| fprintf (stderr, "Malloc fails for usub_n[]."); |
| return (memAux + memRet); |
| } |
| memRet += (float) (nnz_loc * iword); |
| } |
| sub_n = usub_n; |
| } |
| /* Copy the data into the L column / U row oriented storage */ |
| k = 0; |
| for (p = 0; p < nprocs; p++) { |
| i = k; |
| while (i < k + nnzToRecv[p]) { |
| gb = rcv_luind[i]; |
| if (gb >= nsupers) |
| printf ("Pe[%d] p %d gb %d nsupers %d i %d i-k %d\n", |
| iam, p, gb, nsupers, i, i-k); |
| i += 2; |
| if (sendL) gb_l = LBj( gb, grid ); |
| if (sendU) gb_l = LBi( gb, grid ); |
| for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { |
| sub_n[j] = rcv_luind[i]; |
| } |
| } |
| k += nnzToRecv[p]; |
| } |
| if (sendL) { |
| sendL = FALSE; sendU = TRUE; |
| } |
| else |
| sendU = FALSE; |
| } |
| /* deallocate memory allocated during symbolic factorization routine */ |
| if (rcv_luind != NULL) { |
| SUPERLU_FREE (rcv_luind); |
| memAux -= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; |
| } |
| SUPERLU_FREE (mem); |
| memAux -= (float) (12 * nprocs * iword); |
| SUPERLU_FREE(nvtcs); |
| memAux -= (float) (5 * nprocs * sizeof(int)); |
| if (xlsub_s != NULL) { |
| SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); |
| } |
| if (xusub_s != NULL) { |
| SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); |
| } |
| SUPERLU_FREE (globToLoc); |
| if (supno_s != NULL) { |
| SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); |
| SUPERLU_FREE (supno_s); |
| } |
| Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; |
| *p_xlsub = xlsub_n; *p_lsub = lsub_n; |
| *p_xusub = xusub_n; *p_usub = usub_n; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit dist_symbLU()"); |
| #endif |
| return (-memRet); |
| } |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * Re-distribute A on the 2D process mesh. The lower part is |
| * stored using a column format and the upper part |
| * is stored using a row format. |
| * |
| * Arguments |
| * ========= |
| * |
| * A (Input) SuperMatrix* |
| * The distributed input matrix A of dimension (A->nrow, A->ncol). |
| * The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. |
| * |
| * ScalePermstruct (Input) ScalePermstruct_t* |
| * The data structure to store the scaling and permutation vectors |
| * describing the transformations performed to the original matrix A. |
| * |
| * Glu_persist (Input) Glu_persist_t * |
| * Information on supernodes mapping. |
| * |
| * grid (Input) gridinfo_t* |
| * The 2D process mesh. |
| * |
| * p_ainf_colptr (Output) int_t** |
| * Pointer to the lower part of A distributed on a 2D grid |
| * of processors, stored by columns. |
| * |
| * p_ainf_rowind (Output) int_t** |
| * Structure of of the lower part of A distributed on a |
| * 2D grid of processors, stored by columns. |
| * |
| * p_ainf_val (Output) double** |
| * Numerical values of the lower part of A, distributed on a |
| * 2D grid of processors, stored by columns. |
| * |
| * p_asup_rowptr (Output) int_t** |
| * Pointer to the upper part of A distributed on a 2D grid |
| * of processors, stored by rows. |
| * |
| * p_asup_colind (Output) int_t** |
| * Structure of of the upper part of A distributed on a |
| * 2D grid of processors, stored by rows. |
| * |
| * p_asup_val (Output) double** |
| * Numerical values of the upper part of A, distributed on a |
| * 2D grid of processors, stored by rows. |
| * |
| * ilsum_i (Input) int_t * |
| * Starting position of each supernode in |
| * the full array (local, block row wise). |
| * |
| * ilsum_j (Input) int_t * |
| * Starting position of each supernode in |
| * the full array (local, block column wise). |
| * |
| * Return value |
| * ============ |
| * < 0, number of bytes allocated on return from the dist_symbLU |
| * > 0, number of bytes allocated when out of memory. |
| * (an approximation). |
| * </pre> |
| */ |
| static float |
| ddist_A(SuperMatrix *A, ScalePermstruct_t *ScalePermstruct, |
| Glu_persist_t *Glu_persist, gridinfo_t *grid, |
| int_t **p_ainf_colptr, int_t **p_ainf_rowind, double **p_ainf_val, |
| int_t **p_asup_rowptr, int_t **p_asup_colind, double **p_asup_val, |
| int_t *ilsum_i, int_t *ilsum_j |
| ) |
| { |
| int iam, p, procs; |
| NRformat_loc *Astore; |
| int_t *perm_r; /* row permutation vector */ |
| int_t *perm_c; /* column permutation vector */ |
| int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; |
| int_t nsupers, nsupers_i, nsupers_j; |
| int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ |
| int_t nnz_remote; /* number of remote nonzeros to be sent */ |
| int_t SendCnt; /* number of remote nonzeros to be sent */ |
| int_t RecvCnt; /* number of remote nonzeros to be sent */ |
| int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; |
| double *asup_val, *ainf_val; |
| int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; |
| int_t *ia, *ja, **ia_send, *index, *itemp; |
| int_t *ptr_to_send; |
| double *aij, **aij_send, *nzval, *dtemp; |
| double *nzval_a; |
| MPI_Request *send_req; |
| MPI_Status status; |
| int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ |
| int_t *supno = Glu_persist->supno; |
| float memAux; /* Memory used during this routine and freed on return */ |
| float memRet; /* Memory allocated and not freed on return */ |
| int_t iword, dword, szbuf; |
| /* ------------------------------------------------------------ |
| INITIALIZATION. |
| ------------------------------------------------------------*/ |
| iam = grid->iam; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter ddist_A()"); |
| #endif |
| iword = sizeof(int_t); |
| dword = sizeof(double); |
| perm_r = ScalePermstruct->perm_r; |
| perm_c = ScalePermstruct->perm_c; |
| procs = grid->nprow * grid->npcol; |
| Astore = (NRformat_loc *) A->Store; |
| n = A->ncol; |
| m_loc = Astore->m_loc; |
| fst_row = Astore->fst_row; |
| if (!(nnzToRecv = intCalloc_dist(2*procs))) { |
| fprintf (stderr, "Malloc fails for nnzToRecv[]."); |
| return (ERROR_RET); |
| } |
| memAux = (float) (2 * procs * iword); |
| memRet = 0.; |
| nnzToSend = nnzToRecv + procs; |
| nsupers = supno[n-1] + 1; |
| /* ------------------------------------------------------------ |
| COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, |
| THEN ALLOCATE SPACE. |
| THIS ACCOUNTS FOR THE FIRST PASS OF A. |
| ------------------------------------------------------------*/ |
| for (i = 0; i < m_loc; ++i) { |
| for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { |
| irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ |
| jcol = Astore->colind[j]; |
| gbi = BlockNum( irow ); |
| gbj = BlockNum( jcol ); |
| p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); |
| ++nnzToSend[p]; |
| } |
| } |
| /* All-to-all communication */ |
| MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, |
| grid->comm); |
| maxnnzToRecv = 0; |
| nnz_loc = SendCnt = RecvCnt = 0; |
| for (p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| SendCnt += nnzToSend[p]; |
| RecvCnt += nnzToRecv[p]; |
| maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); |
| } else { |
| nnz_loc += nnzToRecv[p]; |
| /*assert(nnzToSend[p] == nnzToRecv[p]);*/ |
| } |
| } |
| k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ |
| szbuf = k; |
| /* Allocate space for storing the triplets after redistribution. */ |
| if ( !(ia = intMalloc_dist(2*k)) ) { |
| fprintf (stderr, "Malloc fails for ia[]."); |
| return (memAux); |
| } |
| memAux += (float) (2*k*iword); |
| ja = ia + k; |
| if ( !(aij = doubleMalloc_dist(k)) ) { |
| fprintf (stderr, "Malloc fails for aij[]."); |
| return (memAux); |
| } |
| memAux += (float) (k*dword); |
| /* Allocate temporary storage for sending/receiving the A triplets. */ |
| if ( procs > 1 ) { |
| if ( !(send_req = (MPI_Request *) |
| SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { |
| fprintf (stderr, "Malloc fails for send_req[]."); |
| return (memAux); |
| } |
| memAux += (float) (2*procs *sizeof(MPI_Request)); |
| if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { |
| fprintf(stderr, "Malloc fails for ia_send[]."); |
| return (memAux); |
| } |
| memAux += (float) (procs*sizeof(int_t*)); |
| if ( !(aij_send = (double **)SUPERLU_MALLOC(procs*sizeof(double*))) ) { |
| fprintf(stderr, "Malloc fails for aij_send[]."); |
| return (memAux); |
| } |
| memAux += (float) (procs*sizeof(double*)); |
| if ( !(index = intMalloc_dist(2*SendCnt)) ) { |
| fprintf(stderr, "Malloc fails for index[]."); |
| return (memAux); |
| } |
| memAux += (float) (2*SendCnt*iword); |
| if ( !(nzval = doubleMalloc_dist(SendCnt)) ) { |
| fprintf(stderr, "Malloc fails for nzval[]."); |
| return (memAux); |
| } |
| memAux += (float) (SendCnt * dword); |
| if ( !(ptr_to_send = intCalloc_dist(procs)) ) { |
| fprintf(stderr, "Malloc fails for ptr_to_send[]."); |
| return (memAux); |
| } |
| memAux += (float) (procs * iword); |
| if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { |
| fprintf(stderr, "Malloc fails for itemp[]."); |
| return (memAux); |
| } |
| memAux += (float) (2*maxnnzToRecv*iword); |
| if ( !(dtemp = doubleMalloc_dist(maxnnzToRecv)) ) { |
| fprintf(stderr, "Malloc fails for dtemp[]."); |
| return (memAux); |
| } |
| memAux += (float) (maxnnzToRecv * dword); |
| for (i = 0, j = 0, p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| ia_send[p] = &index[i]; |
| i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ |
| aij_send[p] = &nzval[j]; |
| j += nnzToSend[p]; |
| } |
| } |
| } /* if procs > 1 */ |
| nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ |
| nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ |
| if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { |
| fprintf (stderr, "Malloc fails for *ainf_colptr[]."); |
| return (memAux); |
| } |
| memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; |
| if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { |
| fprintf (stderr, "Malloc fails for *asup_rowptr[]."); |
| return (memAux+memRet); |
| } |
| memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; |
| /* ------------------------------------------------------------ |
| LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. |
| THIS ACCOUNTS FOR THE SECOND PASS OF A. |
| ------------------------------------------------------------*/ |
| nnz_loc = 0; /* Reset the local nonzero count. */ |
| nnz_loc_ainf = nnz_loc_asup = 0; |
| nzval_a = Astore->nzval; |
| for (i = 0; i < m_loc; ++i) { |
| for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { |
| irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ |
| jcol = Astore->colind[j]; |
| gbi = BlockNum( irow ); |
| gbj = BlockNum( jcol ); |
| p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); |
| if ( p != iam ) { /* remote */ |
| k = ptr_to_send[p]; |
| ia_send[p][k] = irow; |
| ia_send[p][k + nnzToSend[p]] = jcol; |
| aij_send[p][k] = nzval_a[j]; |
| ++ptr_to_send[p]; |
| } else { /* local */ |
| ia[nnz_loc] = irow; |
| ja[nnz_loc] = jcol; |
| aij[nnz_loc] = nzval_a[j]; |
| ++nnz_loc; |
| /* Count nonzeros in each column of L / row of U */ |
| if (gbi >= gbj) { |
| ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; |
| nnz_loc_ainf ++; |
| } |
| else { |
| asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; |
| nnz_loc_asup ++; |
| } |
| } |
| } |
| } |
| /* ------------------------------------------------------------ |
| PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. |
| NOTE: Can possibly use MPI_Alltoallv. |
| ------------------------------------------------------------*/ |
| for (p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| it = 2*nnzToSend[p]; |
| MPI_Isend( ia_send[p], it, mpi_int_t, |
| p, iam, grid->comm, &send_req[p] ); |
| it = nnzToSend[p]; |
| MPI_Isend( aij_send[p], it, MPI_DOUBLE, |
| p, iam+procs, grid->comm, &send_req[procs+p] ); |
| } |
| } |
| for (p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| it = 2*nnzToRecv[p]; |
| MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); |
| it = nnzToRecv[p]; |
| MPI_Recv( dtemp, it, MPI_DOUBLE, p, p+procs, |
| grid->comm, &status ); |
| for (i = 0; i < nnzToRecv[p]; ++i) { |
| ia[nnz_loc] = itemp[i]; |
| irow = itemp[i]; |
| jcol = itemp[i + nnzToRecv[p]]; |
| /* assert(jcol<n); */ |
| ja[nnz_loc] = jcol; |
| aij[nnz_loc] = dtemp[i]; |
| ++nnz_loc; |
| gbi = BlockNum( irow ); |
| gbj = BlockNum( jcol ); |
| /* Count nonzeros in each column of L / row of U */ |
| if (gbi >= gbj) { |
| ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; |
| nnz_loc_ainf ++; |
| } |
| else { |
| asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; |
| nnz_loc_asup ++; |
| } |
| } |
| } |
| } |
| for (p = 0; p < procs; ++p) { |
| if ( p != iam ) { |
| MPI_Wait( &send_req[p], &status); |
| MPI_Wait( &send_req[procs+p], &status); |
| } |
| } |
| /* ------------------------------------------------------------ |
| DEALLOCATE TEMPORARY STORAGE |
| ------------------------------------------------------------*/ |
| SUPERLU_FREE(nnzToRecv); |
| memAux -= 2 * procs * iword; |
| if ( procs > 1 ) { |
| SUPERLU_FREE(send_req); |
| SUPERLU_FREE(ia_send); |
| SUPERLU_FREE(aij_send); |
| SUPERLU_FREE(index); |
| SUPERLU_FREE(nzval); |
| SUPERLU_FREE(ptr_to_send); |
| SUPERLU_FREE(itemp); |
| SUPERLU_FREE(dtemp); |
| memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + |
| procs*sizeof(double*) + 2*SendCnt * iword + |
| SendCnt* dword + procs*iword + |
| 2*maxnnzToRecv*iword + maxnnzToRecv*dword; |
| } |
| /* ------------------------------------------------------------ |
| CONVERT THE TRIPLET FORMAT. |
| ------------------------------------------------------------*/ |
| if (nnz_loc_ainf != 0) { |
| if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { |
| fprintf (stderr, "Malloc fails for *ainf_rowind[]."); |
| return (memAux+memRet); |
| } |
| memRet += (float) (nnz_loc_ainf * iword); |
| if ( !(ainf_val = doubleMalloc_dist(nnz_loc_ainf)) ) { |
| fprintf (stderr, "Malloc fails for *ainf_val[]."); |
| return (memAux+memRet); |
| } |
| memRet += (float) (nnz_loc_ainf * dword); |
| } |
| else { |
| ainf_rowind = NULL; |
| ainf_val = NULL; |
| } |
| if (nnz_loc_asup != 0) { |
| if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { |
| fprintf (stderr, "Malloc fails for *asup_colind[]."); |
| return (memAux + memRet); |
| } |
| memRet += (float) (nnz_loc_asup * iword); |
| if ( !(asup_val = doubleMalloc_dist(nnz_loc_asup)) ) { |
| fprintf (stderr, "Malloc fails for *asup_val[]."); |
| return (memAux + memRet); |
| } |
| memRet += (float) (nnz_loc_asup * dword); |
| } |
| else { |
| asup_colind = NULL; |
| asup_val = NULL; |
| } |
| /* Initialize the array of column pointers */ |
| k = 0; |
| jsize = ainf_colptr[0]; ainf_colptr[0] = 0; |
| for (j = 1; j < ilsum_j[nsupers_j]; j++) { |
| k += jsize; |
| jsize = ainf_colptr[j]; |
| ainf_colptr[j] = k; |
| } |
| ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; |
| i = 0; |
| isize = asup_rowptr[0]; asup_rowptr[0] = 0; |
| for (j = 1; j < ilsum_i[nsupers_i]; j++) { |
| i += isize; |
| isize = asup_rowptr[j]; |
| asup_rowptr[j] = i; |
| } |
| asup_rowptr[ilsum_i[nsupers_i]] = i + isize; |
| /* Copy the triplets into the column oriented storage */ |
| for (i = 0; i < nnz_loc; ++i) { |
| jcol = ja[i]; |
| irow = ia[i]; |
| gbi = BlockNum( irow ); |
| gbj = BlockNum( jcol ); |
| /* Count nonzeros in each column of L / row of U */ |
| if (gbi >= gbj) { |
| j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); |
| k = ainf_colptr[j]; |
| ainf_rowind[k] = irow; |
| ainf_val[k] = aij[i]; |
| ainf_colptr[j] ++; |
| } |
| else { |
| j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); |
| k = asup_rowptr[j]; |
| asup_colind[k] = jcol; |
| asup_val[k] = aij[i]; |
| asup_rowptr[j] ++; |
| } |
| } |
| /* Reset the column pointers to the beginning of each column */ |
| for (j = ilsum_j[nsupers_j]; j > 0; j--) |
| ainf_colptr[j] = ainf_colptr[j-1]; |
| for (j = ilsum_i[nsupers_i]; j > 0; j--) |
| asup_rowptr[j] = asup_rowptr[j-1]; |
| ainf_colptr[0] = 0; |
| asup_rowptr[0] = 0; |
| SUPERLU_FREE(ia); |
| SUPERLU_FREE(aij); |
| memAux -= 2*szbuf*iword + szbuf*dword; |
| *p_ainf_colptr = ainf_colptr; |
| *p_ainf_rowind = ainf_rowind; |
| *p_ainf_val = ainf_val; |
| *p_asup_rowptr = asup_rowptr; |
| *p_asup_colind = asup_colind; |
| *p_asup_val = asup_val; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit ddist_A()"); |
| fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6); |
| #endif |
| return (-memRet); |
| } /* dist_A */ |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * Distribute the input matrix onto the 2D process mesh. |
| * |
| * Arguments |
| * ========= |
| * |
| * fact (input) fact_t |
| * Specifies whether or not the L and U structures will be re-used. |
| * = SamePattern_SameRowPerm: L and U structures are input, and |
| * unchanged on exit. |
| * This routine should not be called for this case, an error |
| * is generated. Instead, pddistribute routine should be called. |
| * = DOFACT or SamePattern: L and U structures are computed and output. |
| * |
| * n (Input) int |
| * Dimension of the matrix. |
| * |
| * A (Input) SuperMatrix* |
| * The distributed input matrix A of dimension (A->nrow, A->ncol). |
| * A may be overwritten by diag(R)*A*diag(C)*Pc^T. |
| * The type of A can be: Stype = NR; Dtype = SLU_D; Mtype = GE. |
| * |
| * ScalePermstruct (Input) ScalePermstruct_t* |
| * The data structure to store the scaling and permutation vectors |
| * describing the transformations performed to the original matrix A. |
| * |
| * Glu_freeable (Input) *Glu_freeable_t |
| * The global structure describing the graph of L and U. |
| * |
| * LUstruct (Input) LUstruct_t* |
| * Data structures for L and U factors. |
| * |
| * grid (Input) gridinfo_t* |
| * The 2D process mesh. |
| * |
| * Return value |
| * ============ |
| * < 0, number of bytes allocated on return from the dist_symbLU |
| * > 0, number of bytes allocated for performing the distribution |
| * of the data, when out of memory. |
| * (an approximation). |
| * </pre> |
| */ |
| float |
| ddist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, |
| ScalePermstruct_t *ScalePermstruct, |
| Pslu_freeable_t *Pslu_freeable, |
| LUstruct_t *LUstruct, gridinfo_t *grid) |
| { |
| Glu_persist_t *Glu_persist = LUstruct->Glu_persist; |
| Glu_freeable_t Glu_freeable_n; |
| LocalLU_t *Llu = LUstruct->Llu; |
| int_t bnnz, fsupc, i, irow, istart, j, jb, jj, k, |
| len, len1, nsupc, nsupc_gb, ii, nprocs; |
| int_t ljb; /* local block column number */ |
| int_t nrbl; /* number of L blocks in current block column */ |
| int_t nrbu; /* number of U blocks in current block column */ |
| int_t gb; /* global block number; 0 < gb <= nsuper */ |
| int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ |
| int iam, jbrow, jbcol, jcol, kcol, mycol, myrow, pc, pr, ljb_i, ljb_j, p; |
| int_t mybufmax[NBUFFERS]; |
| NRformat_loc *Astore; |
| double *a; |
| int_t *asub, *xa; |
| int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; |
| double *asup_val, *ainf_val; |
| int_t *xsup, *supno; /* supernode and column mapping */ |
| int_t *lsub, *xlsub, *usub, *xusub; |
| int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; |
| int_t next_ind; /* next available position in index[*] */ |
| int_t next_val; /* next available position in nzval[*] */ |
| int_t *index; /* indices consist of headers and row subscripts */ |
| double *lusup, *uval; /* nonzero values in L and U */ |
| int_t *recvBuf; |
| int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; |
| double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ |
| int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ |
| double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ |
| int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ |
| /*-- Counts to be used in factorization. --*/ |
| int_t *ToRecv, *ToSendD, **ToSendR; |
| /*-- Counts to be used in lower triangular solve. --*/ |
| int_t *fmod; /* Modification count for L-solve. */ |
| int_t **fsendx_plist; /* Column process list to send down Xk. */ |
| int_t nfrecvx = 0; /* Number of Xk I will receive. */ |
| int_t nfsendx = 0; /* Number of Xk I will send */ |
| int_t kseen; |
| /*-- Counts to be used in upper triangular solve. --*/ |
| int_t *bmod; /* Modification count for U-solve. */ |
| int_t **bsendx_plist; /* Column process list to send down Xk. */ |
| int_t nbrecvx = 0; /* Number of Xk I will receive. */ |
| int_t nbsendx = 0; /* Number of Xk I will send */ |
| int_t *ilsum; /* starting position of each supernode in |
| the full array (local) */ |
| int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in |
| the full array (local, block column wise) */ |
| /*-- Auxiliary arrays; freed on return --*/ |
| int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ |
| int_t *LUb_length; /* L,U block length; size nsupers_ij */ |
| int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */ |
| int_t *LUb_number; /* global block number; size nsupers_ij */ |
| int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ |
| int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ |
| double *dense, *dense_col; /* SPA */ |
| double zero = 0.0; |
| int_t ldaspa; /* LDA of SPA */ |
| int_t iword, dword; |
| float memStrLU, memA, |
| memDist = 0.; /* memory used for redistributing the data, which does |
| not include the memory for the numerical values |
| of L and U (positive number)*/ |
| float memNLU = 0.; /* memory allocated for storing the numerical values of |
| L and U, that will be used in the numeric |
| factorization (positive number) */ |
| #if ( PRNTlevel>=1 ) |
| int_t nLblocks = 0, nUblocks = 0; |
| #endif |
| /* Initialization. */ |
| iam = grid->iam; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter dist_psymbtonum()"); |
| #endif |
| myrow = MYROW( iam, grid ); |
| mycol = MYCOL( iam, grid ); |
| nprocs = grid->npcol * grid->nprow; |
| for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; |
| Astore = (NRformat_loc *) A->Store; |
| iword = sizeof(int_t); |
| dword = sizeof(double); |
| if (fact == SamePattern_SameRowPerm) { |
| ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); |
| } |
| if ((memStrLU = |
| dist_symbLU (n, Pslu_freeable, |
| Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0) |
| return (memStrLU); |
| memDist += (-memStrLU); |
| xsup = Glu_persist->xsup; /* supernode and column mapping */ |
| supno = Glu_persist->supno; |
| nsupers = supno[n-1] + 1; |
| nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */ |
| nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */ |
| nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j); |
| if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) { |
| fprintf (stderr, "Malloc fails for ilsum[]."); |
| return (memDist + memNLU); |
| } |
| memNLU += (nsupers_i+1) * iword; |
| if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) { |
| fprintf (stderr, "Malloc fails for ilsum_j[]."); |
| return (memDist + memNLU); |
| } |
| memDist += (nsupers_j+1) * iword; |
| /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. */ |
| ilsum[0] = 0; |
| ldaspa = 0; |
| for (gb = 0; gb < nsupers; gb++) |
| if ( myrow == PROW( gb, grid ) ) { |
| i = SuperSize( gb ); |
| ldaspa += i; |
| lb = LBi( gb, grid ); |
| ilsum[lb + 1] = ilsum[lb] + i; |
| } |
| ilsum[nsupers_i] = ldaspa; |
| ldaspa_j = 0; ilsum_j[0] = 0; |
| for (gb = 0; gb < nsupers; gb++) |
| if (mycol == PCOL( gb, grid )) { |
| i = SuperSize( gb ); |
| ldaspa_j += i; |
| lb = LBj( gb, grid ); |
| ilsum_j[lb + 1] = ilsum_j[lb] + i; |
| } |
| ilsum_j[nsupers_j] = ldaspa_j; |
| if ((memA = ddist_A(A, ScalePermstruct, Glu_persist, |
| grid, &ainf_colptr, &ainf_rowind, &ainf_val, |
| &asup_rowptr, &asup_colind, &asup_val, |
| ilsum, ilsum_j)) > 0) |
| return (memDist + memA + memNLU); |
| memDist += (-memA); |
| /* ------------------------------------------------------------ |
| FIRST TIME CREATING THE L AND U DATA STRUCTURES. |
| ------------------------------------------------------------*/ |
| /* We first need to set up the L and U data structures and then |
| * propagate the values of A into them. |
| */ |
| if ( !(ToRecv = intCalloc_dist(nsupers)) ) { |
| fprintf(stderr, "Calloc fails for ToRecv[]."); |
| return (memDist + memNLU); |
| } |
| memNLU += nsupers * iword; |
| k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ |
| if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) { |
| fprintf(stderr, "Malloc fails for ToSendR[]."); |
| return (memDist + memNLU); |
| } |
| memNLU += k*sizeof(int_t*); |
| j = k * grid->npcol; |
| if ( !(index = intMalloc_dist(j)) ) { |
| fprintf(stderr, "Malloc fails for index[]."); |
| return (memDist + memNLU); |
| } |
| memNLU += j*iword; |
| for (i = 0; i < j; ++i) index[i] = EMPTY; |
| for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; |
| /* Auxiliary arrays used to set up L and U block data structures. |
| They are freed on return. */ |
| if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) { |
| fprintf(stderr, "Calloc fails for LUb_length[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) { |
| fprintf(stderr, "Malloc fails for LUb_indptr[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) { |
| fprintf(stderr, "Calloc fails for LUb_number[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) { |
| fprintf(stderr, "Calloc fails for LUb_valptr[]."); |
| return (memDist + memNLU); |
| } |
| memDist += 4 * nsupers_ij * iword; |
| k = CEILING( nsupers, grid->nprow ); |
| /* Pointers to the beginning of each block row of U. */ |
| if ( !(Unzval_br_ptr = |
| (double**)SUPERLU_MALLOC(nsupers_i * sizeof(double*))) ) { |
| fprintf(stderr, "Malloc fails for Unzval_br_ptr[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) { |
| fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[]."); |
| return (memDist + memNLU); |
| } |
| memNLU += nsupers_i*sizeof(double*) + nsupers_i*sizeof(int_t*); |
| Unzval_br_ptr[nsupers_i-1] = NULL; |
| Ufstnz_br_ptr[nsupers_i-1] = NULL; |
| if ( !(ToSendD = intCalloc_dist(nsupers_i)) ) { |
| fprintf(stderr, "Malloc fails for ToSendD[]."); |
| return (memDist + memNLU); |
| } |
| memNLU += nsupers_i*iword; |
| if ( !(Urb_marker = intCalloc_dist(nsupers_j))) { |
| fprintf(stderr, "Calloc fails for rb_marker[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) { |
| fprintf(stderr, "Calloc fails for rb_marker[]."); |
| return (memDist + memNLU); |
| } |
| memDist += (nsupers_i + nsupers_j)*iword; |
| /* Auxiliary arrays used to set up L, U block data structures. |
| They are freed on return. |
| k is the number of local row blocks. */ |
| if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) |
| * sp_ienv_dist(3))) ) { |
| fprintf(stderr, "Calloc fails for SPA dense[]."); |
| return (memDist + memNLU); |
| } |
| /* These counts will be used for triangular solves. */ |
| if ( !(fmod = intCalloc_dist(nsupers_i)) ) { |
| fprintf(stderr, "Calloc fails for fmod[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(bmod = intCalloc_dist(nsupers_i)) ) { |
| fprintf(stderr, "Calloc fails for bmod[]."); |
| return (memDist + memNLU); |
| } |
| /* ------------------------------------------------ */ |
| memNLU += 2*nsupers_i*iword + |
| SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; |
| /* Pointers to the beginning of each block column of L. */ |
| if ( !(Lnzval_bc_ptr = |
| (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) { |
| fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { |
| fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); |
| return (memDist + memNLU); |
| } |
| memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*); |
| Lnzval_bc_ptr[nsupers_j-1] = NULL; |
| Lrowind_bc_ptr[nsupers_j-1] = NULL; |
| /* These lists of processes will be used for triangular solves. */ |
| if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { |
| fprintf(stderr, "Malloc fails for fsendx_plist[]."); |
| return (memDist + memNLU); |
| } |
| len = nsupers_j * grid->nprow; |
| if ( !(index = intMalloc_dist(len)) ) { |
| fprintf(stderr, "Malloc fails for fsendx_plist[0]"); |
| return (memDist + memNLU); |
| } |
| for (i = 0; i < len; ++i) index[i] = EMPTY; |
| for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) |
| fsendx_plist[i] = &index[j]; |
| if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { |
| fprintf(stderr, "Malloc fails for bsendx_plist[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(index = intMalloc_dist(len)) ) { |
| fprintf(stderr, "Malloc fails for bsendx_plist[0]"); |
| return (memDist + memNLU); |
| } |
| for (i = 0; i < len; ++i) index[i] = EMPTY; |
| for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) |
| bsendx_plist[i] = &index[j]; |
| /* -------------------------------------------------------------- */ |
| memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword; |
| /*------------------------------------------------------------ |
| PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. |
| THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. |
| ------------------------------------------------------------*/ |
| for (jb = 0; jb < nsupers; jb++) { |
| jbcol = PCOL( jb, grid ); |
| jbrow = PROW( jb, grid ); |
| ljb_j = LBj( jb, grid ); /* Local block number column wise */ |
| ljb_i = LBi( jb, grid); /* Local block number row wise */ |
| fsupc = FstBlockC( jb ); |
| nsupc = SuperSize( jb ); |
| if ( myrow == jbrow ) { /* Block row jb in my process row */ |
| /* Scatter A into SPA. */ |
| for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) { |
| for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) { |
| if (i >= asup_rowptr[ilsum[nsupers_i]]) |
| printf ("ERR7\n"); |
| jcol = asup_colind[i]; |
| if (jcol >= n) |
| printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n", |
| iam, jb, gb, j, jcol); |
| gb = BlockNum( jcol ); |
| lb = LBj( gb, grid ); |
| if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n"); |
| jcol = ilsum_j[lb] + jcol - FstBlockC( gb ); |
| if (jcol >= ldaspa_j) |
| printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n", |
| iam, jb, gb, j, jcol); |
| dense_col[jcol] = asup_val[i]; |
| } |
| dense_col += ldaspa_j; |
| } |
| /*------------------------------------------------ |
| * SET UP U BLOCKS. |
| *------------------------------------------------*/ |
| /* Count number of blocks and length of each block. */ |
| nrbu = 0; |
| len = 0; /* Number of column subscripts I own. */ |
| len1 = 0; /* number of fstnz subscripts */ |
| for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { |
| if (i >= xusub[nsupers_i]) printf ("ERR10\n"); |
| jcol = usub[i]; |
| gb = BlockNum( jcol ); /* Global block number */ |
| /*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986) |
| printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n", |
| iam, jb, gb, jcol, jbcol, pc); */ |
| lb = LBj( gb, grid ); /* Local block number */ |
| pc = PCOL( gb, grid ); /* Process col owning this block */ |
| if (mycol == jbcol) ToSendR[ljb_j][pc] = YES; |
| /* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */ |
| pr = PROW( gb, grid ); |
| if ( pr != jbrow && mycol == pc) |
| bsendx_plist[lb][jbrow] = YES; |
| if (mycol == pc) { |
| len += nsupc; |
| LUb_length[lb] += nsupc; |
| ToSendD[ljb_i] = YES; |
| if (Urb_marker[lb] <= jb) { /* First see this block */ |
| if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++; |
| Urb_marker[lb] = jb + 1; |
| LUb_number[nrbu] = gb; |
| /* if (gb == 391825 && jb == 145361) |
| printf ("Pe[%d] T1 [%d %d] nrbu %d \n", |
| iam, jb, gb, nrbu); */ |
| nrbu ++; |
| len1 += SuperSize( gb ); |
| if ( gb != jb )/* Exclude diagonal block. */ |
| ++bmod[ljb_i];/* Mod. count for back solve */ |
| #if ( PRNTlevel>=1 ) |
| ++nUblocks; |
| #endif |
| } |
| } |
| } /* for i ... */ |
| if ( nrbu ) { |
| /* Sort the blocks of U in increasing block column index. |
| SuperLU_DIST assumes this is true */ |
| /* simple insert sort algorithm */ |
| /* to be transformed in quick sort */ |
| for (j = 1; j < nrbu; j++) { |
| k = LUb_number[j]; |
| for (i=j-1; i>=0 && LUb_number[i] > k; i--) { |
| LUb_number[i+1] = LUb_number[i]; |
| } |
| LUb_number[i+1] = k; |
| } |
| /* Set up the initial pointers for each block in |
| index[] and nzval[]. */ |
| /* Add room for descriptors */ |
| len1 += BR_HEADER + nrbu * UB_DESCRIPTOR; |
| if ( !(index = intMalloc_dist(len1+1)) ) { |
| fprintf (stderr, "Malloc fails for Uindex[]"); |
| return (memDist + memNLU); |
| } |
| Ufstnz_br_ptr[ljb_i] = index; |
| if (!(Unzval_br_ptr[ljb_i] = |
| doubleMalloc_dist(len))) { |
| fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]"); |
| return (memDist + memNLU); |
| } |
| memNLU += (len1+1)*iword + len*dword; |
| uval = Unzval_br_ptr[ljb_i]; |
| mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); |
| mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); |
| index[0] = nrbu; /* Number of column blocks */ |
| index[1] = len; /* Total length of nzval[] */ |
| index[2] = len1; /* Total length of index */ |
| index[len1] = -1; /* End marker */ |
| next_ind = BR_HEADER; |
| next_val = 0; |
| for (k = 0; k < nrbu; k++) { |
| gb = LUb_number[k]; |
| lb = LBj( gb, grid ); |
| len = LUb_length[lb]; |
| LUb_length[lb] = 0; /* Reset vector of block length */ |
| index[next_ind++] = gb; /* Descriptor */ |
| index[next_ind++] = len; |
| LUb_indptr[lb] = next_ind; |
| for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++) |
| index[next_ind] = FstBlockC( jb + 1 ); |
| LUb_valptr[lb] = next_val; |
| next_val += len; |
| } |
| /* Propagate the fstnz subscripts to Ufstnz_br_ptr[], |
| and the initial values of A from SPA into Unzval_br_ptr[]. */ |
| for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { |
| jcol = usub[i]; |
| gb = BlockNum( jcol ); |
| if ( mycol == PCOL( gb, grid ) ) { |
| lb = LBj( gb, grid ); |
| k = LUb_indptr[lb]; /* Start fstnz in index */ |
| index[k + jcol - FstBlockC( gb )] = FstBlockC( jb ); |
| } |
| } /* for i ... */ |
| for (i = 0; i < nrbu; i++) { |
| gb = LUb_number[i]; |
| lb = LBj( gb, grid ); |
| next_ind = LUb_indptr[lb]; |
| k = FstBlockC( jb + 1); |
| jcol = ilsum_j[lb]; |
| for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) { |
| dense_col = dense; |
| j = index[next_ind+jj]; |
| for (ii = j; ii < k; ii++) { |
| uval[LUb_valptr[lb]++] = dense_col[jcol]; |
| dense_col[jcol] = zero; |
| dense_col += ldaspa_j; |
| } |
| } |
| } |
| } else { |
| Ufstnz_br_ptr[ljb_i] = NULL; |
| Unzval_br_ptr[ljb_i] = NULL; |
| } /* if nrbu ... */ |
| } /* if myrow == jbrow */ |
| /*------------------------------------------------ |
| * SET UP L BLOCKS. |
| *------------------------------------------------*/ |
| if (mycol == jbcol) { /* Block column jb in my process column */ |
| /* Scatter A_inf into SPA. */ |
| for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) { |
| for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) { |
| irow = ainf_rowind[i]; |
| if (irow >= n) printf ("Pe[%d] ERR1\n", iam); |
| gb = BlockNum( irow ); |
| if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam); |
| if ( myrow == PROW( gb, grid ) ) { |
| lb = LBi( gb, grid ); |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam); |
| dense_col[irow] = ainf_val[i]; |
| } |
| } |
| dense_col += ldaspa; |
| } |
| /* sort the indices of the diagonal block at the beginning of xlsub */ |
| if (myrow == jbrow) { |
| k = xlsub[ljb_j]; |
| for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { |
| irow = lsub[i]; |
| if (irow < nsupc + fsupc && i != k+irow-fsupc) { |
| lsub[i] = lsub[k + irow - fsupc]; |
| lsub[k + irow - fsupc] = irow; |
| i --; |
| } |
| } |
| } |
| /* Count number of blocks and length of each block. */ |
| nrbl = 0; |
| len = 0; /* Number of row subscripts I own. */ |
| kseen = 0; |
| for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { |
| irow = lsub[i]; |
| gb = BlockNum( irow ); /* Global block number */ |
| pr = PROW( gb, grid ); /* Process row owning this block */ |
| if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY && |
| myrow == jbrow) { |
| fsendx_plist[ljb_j][pr] = YES; |
| ++nfsendx; |
| } |
| if ( myrow == pr ) { |
| lb = LBi( gb, grid ); /* Local block number */ |
| if (Lrb_marker[lb] <= jb) { /* First see this block */ |
| Lrb_marker[lb] = jb + 1; |
| LUb_length[lb] = 1; |
| LUb_number[nrbl++] = gb; |
| if ( gb != jb ) /* Exclude diagonal block. */ |
| ++fmod[lb]; /* Mod. count for forward solve */ |
| if ( kseen == 0 && myrow != jbrow ) { |
| ++nfrecvx; |
| kseen = 1; |
| } |
| #if ( PRNTlevel>=1 ) |
| ++nLblocks; |
| #endif |
| } else |
| ++LUb_length[lb]; |
| ++len; |
| } |
| } /* for i ... */ |
| if ( nrbl ) { /* Do not ensure the blocks are sorted! */ |
| /* Set up the initial pointers for each block in |
| index[] and nzval[]. */ |
| /* If I am the owner of the diagonal block, order it first in LUb_number. |
| Necessary for SuperLU_DIST routines */ |
| kseen = EMPTY; |
| for (j = 0; j < nrbl; j++) { |
| if (LUb_number[j] == jb) |
| kseen = j; |
| } |
| if (kseen != EMPTY && kseen != 0) { |
| LUb_number[kseen] = LUb_number[0]; |
| LUb_number[0] = jb; |
| } |
| /* Add room for descriptors */ |
| len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; |
| if ( !(index = intMalloc_dist(len1)) ) { |
| fprintf (stderr, "Malloc fails for index[]"); |
| return (memDist + memNLU); |
| } |
| Lrowind_bc_ptr[ljb_j] = index; |
| if (!(Lnzval_bc_ptr[ljb_j] = |
| doubleMalloc_dist(len*nsupc))) { |
| fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d ", jb); |
| return (memDist + memNLU); |
| } |
| memNLU += len1*iword + len*nsupc*dword; |
| lusup = Lnzval_bc_ptr[ljb_j]; |
| mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); |
| mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); |
| mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); |
| index[0] = nrbl; /* Number of row blocks */ |
| index[1] = len; /* LDA of the nzval[] */ |
| next_ind = BC_HEADER; |
| next_val = 0; |
| for (k = 0; k < nrbl; ++k) { |
| gb = LUb_number[k]; |
| lb = LBi( gb, grid ); |
| len = LUb_length[lb]; |
| LUb_length[lb] = 0; |
| index[next_ind++] = gb; /* Descriptor */ |
| index[next_ind++] = len; |
| LUb_indptr[lb] = next_ind; |
| LUb_valptr[lb] = next_val; |
| next_ind += len; |
| next_val += len; |
| } |
| /* Propagate the compressed row subscripts to Lindex[], |
| and the initial values of A from SPA into Lnzval[]. */ |
| len = index[1]; /* LDA of lusup[] */ |
| for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { |
| irow = lsub[i]; |
| gb = BlockNum( irow ); |
| if ( myrow == PROW( gb, grid ) ) { |
| lb = LBi( gb, grid ); |
| k = LUb_indptr[lb]++; /* Random access a block */ |
| index[k] = irow; |
| k = LUb_valptr[lb]++; |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| for (j = 0, dense_col = dense; j < nsupc; ++j) { |
| lusup[k] = dense_col[irow]; |
| dense_col[irow] = zero; |
| k += len; |
| dense_col += ldaspa; |
| } |
| } |
| } /* for i ... */ |
| } else { |
| Lrowind_bc_ptr[ljb_j] = NULL; |
| Lnzval_bc_ptr[ljb_j] = NULL; |
| } /* if nrbl ... */ |
| } /* if mycol == pc */ |
| } /* for jb ... */ |
| SUPERLU_FREE(ilsum_j); |
| SUPERLU_FREE(Urb_marker); |
| SUPERLU_FREE(LUb_length); |
| SUPERLU_FREE(LUb_indptr); |
| SUPERLU_FREE(LUb_number); |
| SUPERLU_FREE(LUb_valptr); |
| SUPERLU_FREE(Lrb_marker); |
| SUPERLU_FREE(dense); |
| /* Free the memory used for storing L and U */ |
| SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); |
| if (lsub != NULL) |
| SUPERLU_FREE(lsub); |
| if (usub != NULL) |
| SUPERLU_FREE(usub); |
| /* Free the memory used for storing A */ |
| SUPERLU_FREE(ainf_colptr); |
| if (ainf_rowind != NULL) { |
| SUPERLU_FREE(ainf_rowind); |
| SUPERLU_FREE(ainf_val); |
| } |
| SUPERLU_FREE(asup_rowptr); |
| if (asup_colind != NULL) { |
| SUPERLU_FREE(asup_colind); |
| SUPERLU_FREE(asup_val); |
| } |
| /* exchange information about bsendx_plist in between column of processors */ |
| k = SUPERLU_MAX( grid->nprow, grid->npcol); |
| if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) { |
| fprintf (stderr, "Malloc fails for recvBuf[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { |
| fprintf (stderr, "Malloc fails for nnzToRecv[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { |
| fprintf (stderr, "Malloc fails for ptrToRecv[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { |
| fprintf (stderr, "Malloc fails for nnzToRecv[]."); |
| return (memDist + memNLU); |
| } |
| if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { |
| fprintf (stderr, "Malloc fails for ptrToRecv[]."); |
| return (memDist + memNLU); |
| } |
| if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int))) |
| memDist = nsupers*k*iword +4*nprocs * sizeof(int); |
| for (p = 0; p < nprocs; p++) |
| nnzToRecv[p] = 0; |
| for (jb = 0; jb < nsupers; jb++) { |
| jbcol = PCOL( jb, grid ); |
| jbrow = PROW( jb, grid ); |
| p = PNUM(jbrow, jbcol, grid); |
| nnzToRecv[p] += grid->npcol; |
| } |
| i = 0; |
| for (p = 0; p < nprocs; p++) { |
| ptrToRecv[p] = i; |
| i += nnzToRecv[p]; |
| ptrToSend[p] = 0; |
| if (p != iam) |
| nnzToSend[p] = nnzToRecv[iam]; |
| else |
| nnzToSend[p] = 0; |
| } |
| nnzToRecv[iam] = 0; |
| i = ptrToRecv[iam]; |
| for (jb = 0; jb < nsupers; jb++) { |
| jbcol = PCOL( jb, grid ); |
| jbrow = PROW( jb, grid ); |
| p = PNUM(jbrow, jbcol, grid); |
| if (p == iam) { |
| ljb_j = LBj( jb, grid ); /* Local block number column wise */ |
| for (j = 0; j < grid->npcol; j++, i++) |
| recvBuf[i] = ToSendR[ljb_j][j]; |
| } |
| } |
| MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t, |
| recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm); |
| for (jb = 0; jb < nsupers; jb++) { |
| jbcol = PCOL( jb, grid ); |
| jbrow = PROW( jb, grid ); |
| p = PNUM(jbrow, jbcol, grid); |
| ljb_j = LBj( jb, grid ); /* Local block number column wise */ |
| ljb_i = LBi( jb, grid ); /* Local block number row wise */ |
| /* (myrow == jbrow) { |
| if (ToSendD[ljb_i] == YES) |
| ToRecv[jb] = 1; |
| } |
| else { |
| if (recvBuf[ptrToRecv[p] + mycol] == YES) |
| ToRecv[jb] = 2; |
| } */ |
| if (recvBuf[ptrToRecv[p] + mycol] == YES) { |
| if (myrow == jbrow) |
| ToRecv[jb] = 1; |
| else |
| ToRecv[jb] = 2; |
| } |
| if (mycol == jbcol) { |
| for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) |
| ToSendR[ljb_j][i] = recvBuf[j]; |
| ToSendR[ljb_j][mycol] = EMPTY; |
| } |
| ptrToRecv[p] += grid->npcol; |
| } |
| /* exchange information about bsendx_plist in between column of processors */ |
| MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t, |
| MPI_MAX, grid->cscp.comm); |
| for (jb = 0; jb < nsupers; jb ++) { |
| jbcol = PCOL( jb, grid); |
| jbrow = PROW( jb, grid); |
| if (mycol == jbcol) { |
| ljb_j = LBj( jb, grid ); /* Local block number column wise */ |
| if (myrow == jbrow ) { |
| for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) { |
| (*bsendx_plist)[k] = recvBuf[k]; |
| if ((*bsendx_plist)[k] != EMPTY) |
| nbsendx ++; |
| } |
| } |
| else { |
| for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) |
| (*bsendx_plist)[k] = EMPTY; |
| } |
| } |
| } |
| SUPERLU_FREE(nnzToRecv); |
| SUPERLU_FREE(ptrToRecv); |
| SUPERLU_FREE(nnzToSend); |
| SUPERLU_FREE(ptrToSend); |
| SUPERLU_FREE(recvBuf); |
| Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; |
| Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; |
| Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; |
| Llu->Unzval_br_ptr = Unzval_br_ptr; |
| Llu->ToRecv = ToRecv; |
| Llu->ToSendD = ToSendD; |
| Llu->ToSendR = ToSendR; |
| Llu->fmod = fmod; |
| Llu->fsendx_plist = fsendx_plist; |
| Llu->nfrecvx = nfrecvx; |
| Llu->nfsendx = nfsendx; |
| Llu->bmod = bmod; |
| Llu->bsendx_plist = bsendx_plist; |
| Llu->nbrecvx = nbrecvx; |
| Llu->nbsendx = nbsendx; |
| Llu->ilsum = ilsum; |
| Llu->ldalsum = ldaspa; |
| LUstruct->Glu_persist = Glu_persist; |
| #if ( PRNTlevel>=1 ) |
| if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", |
| nLblocks, nUblocks); |
| #endif |
| k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ |
| if ( !(Llu->mod_bit = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for mod_bit[]."); |
| /* Find the maximum buffer size. */ |
| MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, |
| MPI_MAX, grid->comm); |
| #if ( DEBUGlevel>=1 ) |
| /* Memory allocated but not freed: |
| ilsum, fmod, fsendx_plist, bmod, bsendx_plist, |
| ToRecv, ToSendR, ToSendD, mod_bit |
| */ |
| CHECK_MALLOC(iam, "Exit dist_psymbtonum()"); |
| #endif |
| return (- (memDist+memNLU)); |
| } /* ddist_psymbtonum */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/Cnames.h |
|---|
| New file |
| 0,0 → 1,349 |
| /*! @file |
| * \brief Macro definitions |
| * |
| * <pre> |
| * -- Distributed SuperLU routine (version 1.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * September 1, 1999 |
| * </pre> |
| */ |
| #ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ |
| #define __SUPERLU_CNAMES |
| /* |
| * These macros define how C routines will be called. ADD_ assumes that |
| * they will be called by fortran, which expects C routines to have an |
| * underscore postfixed to the name (Suns, and the Intel expect this). |
| * NOCHANGE indicates that fortran will be calling, and that it expects |
| * the name called by fortran to be identical to that compiled by the C |
| * (RS6K's do this). UPCASE says it expects C routines called by fortran |
| * to be in all upcase (CRAY wants this). |
| */ |
| #define ADD_ 0 |
| #define NOCHANGE 1 |
| #define UPCASE 2 |
| #define C_CALL 3 |
| #ifdef UpCase |
| #define F77_CALL_C UPCASE |
| #endif |
| #ifdef NoChange |
| #define F77_CALL_C NOCHANGE |
| #endif |
| #ifdef Add_ |
| #define F77_CALL_C ADD_ |
| #endif |
| #ifndef F77_CALL_C |
| #define F77_CALL_C ADD_ |
| #endif |
| #if (F77_CALL_C == ADD_) |
| /* |
| * These defines set up the naming scheme required to have a fortran 77 |
| * routine call a C routine |
| * No redefinition necessary to have following Fortran to C interface: |
| * FORTRAN CALL C DECLARATION |
| * call dgemm(...) void dgemm_(...) |
| * |
| * This is the default. |
| */ |
| /* These are the functions defined in F90 wraper */ |
| #define f_create_gridinfo_handle f_create_gridinfo_handle_ |
| #define f_create_options_handle f_create_options_handle_ |
| #define f_create_ScalePerm_handle f_create_scaleperm_handle_ |
| #define f_create_LUstruct_handle f_create_lustruct_handle_ |
| #define f_create_SOLVEstruct_handle f_create_solvestruct_handle_ |
| #define f_create_SuperMatrix_handle f_create_supermatrix_handle_ |
| #define f_destroy_gridinfo_handle f_destroy_gridinfo_handle_ |
| #define f_destroy_options_handle f_destroy_options_handle_ |
| #define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle_ |
| #define f_destroy_LUstruct_handle f_destroy_lustruct_handle_ |
| #define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle_ |
| #define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle_ |
| #define f_create_SuperLUStat_handle f_create_superlustat_handle_ |
| #define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle_ |
| #define f_get_gridinfo f_get_gridinfo_ |
| #define f_get_SuperMatrix f_get_supermatrix_ |
| #define f_set_SuperMatrix f_set_supermatrix_ |
| #define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix_ |
| #define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix_ |
| #define f_get_superlu_options f_get_superlu_options_ |
| #define f_set_superlu_options f_set_superlu_options_ |
| #define f_set_default_options f_set_default_options_ |
| #define f_superlu_gridinit f_superlu_gridinit_ |
| #define f_superlu_gridexit f_superlu_gridexit_ |
| #define f_ScalePermstructInit f_scalepermstructinit_ |
| #define f_ScalePermstructFree f_scalepermstructfree_ |
| #define f_PStatInit f_pstatinit_ |
| #define f_PStatFree f_pstatfree_ |
| #define f_LUstructInit f_lustructinit_ |
| #define f_LUstructFree f_lustructfree_ |
| #define f_Destroy_LU f_destroy_lu_ |
| #define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist_ |
| #define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist_ |
| #define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist_ |
| #define f_dSolveFinalize f_dsolvefinalize_ |
| #define f_pdgssvx f_pdgssvx_ |
| #define f_dcreate_dist_matrix f_dcreate_dist_matrix_ |
| #define f_check_malloc f_check_malloc_ |
| #endif |
| #if (F77_CALL_C == UPCASE) |
| /* |
| * These defines set up the naming scheme required to have a fortran 77 |
| * routine call a C routine |
| * following Fortran to C interface: |
| * FORTRAN CALL C DECLARATION |
| * call dgemm(...) void DGEMM(...) |
| */ |
| /* BLAS */ |
| #define sasum_ SASUM |
| #define isamax_ ISAMAX |
| #define scopy_ SCOPY |
| #define sscal_ SSCAL |
| #define sger_ SGER |
| #define snrm2_ SNRM2 |
| #define ssymv_ SSYMV |
| #define sdot_ SDOT |
| #define saxpy_ SAXPY |
| #define ssyr2_ SSYR2 |
| #define srot_ SROT |
| #define sgemv_ SGEMV |
| #define strsv_ STRSV |
| #define sgemm_ SGEMM |
| #define strsm_ STRSM |
| #define dasum_ DASUM |
| #define idamax_ IDAMAX |
| #define dcopy_ DCOPY |
| #define dscal_ DSCAL |
| #define dger_ DGER |
| #define dnrm2_ DNRM2 |
| #define dsymv_ DSYMV |
| #define ddot_ DDOT |
| #define daxpy_ DAXPY |
| #define dsyr2_ DSYR2 |
| #define drot_ DROT |
| #define dgemv_ DGEMV |
| #define dtrsv_ DTRSV |
| #define dgemm_ DGEMM |
| #define dtrsm_ DTRSM |
| #define scasum_ SCASUM |
| #define icamax_ ICAMAX |
| #define ccopy_ CCOPY |
| #define cscal_ CSCAL |
| #define scnrm2_ SCNRM2 |
| #define caxpy_ CAXPY |
| #define cgemv_ CGEMV |
| #define ctrsv_ CTRSV |
| #define cgemm_ CGEMM |
| #define ctrsm_ CTRSM |
| #define cgerc_ CGERC |
| #define chemv_ CHEMV |
| #define cher2_ CHER2 |
| #define dzasum_ DZASUM |
| #define izamax_ IZAMAX |
| #define zcopy_ ZCOPY |
| #define zscal_ ZSCAL |
| #define dznrm2_ DZNRM2 |
| #define zaxpy_ ZAXPY |
| #define zgemv_ ZGEMV |
| #define ztrsv_ ZTRSV |
| #define zgemm_ ZGEMM |
| #define ztrsm_ ZTRSM |
| #define zgerc_ ZGERC |
| #define zhemv_ ZHEMV |
| #define zher2_ ZHER2 |
| #define zgeru_ ZGERU |
| /* LAPACK */ |
| #define dlamch_ DLAMCH |
| #define slamch_ SLAMCH |
| #define xerbla_ XERBLA |
| #define lsame_ LSAME |
| #define mc64id_ MC64ID |
| #define mc64ad_ MC64AD |
| #define c_bridge_dgssv_ C_BRIDGE_DGSSV |
| #define c_fortran_slugrid_ C_FORTRAN_SLUGRID |
| #define c_fortran_pdgssvx_ C_FORTRAN_PDGSSVX |
| #define c_fortran_pdgssvx_ABglobal_ C_FORTRAN_PDGSSVX_ABGLOBAL |
| #define c_fortran_pzgssvx_ C_FORTRAN_PZGSSVX |
| #define c_fortran_pzgssvx_ABglobal_ C_FORTRAN_PZGSSVX_ABGLOBAL |
| /* These are the functions defined in F90 wraper */ |
| #define f_create_gridinfo_handle F_CREATE_GRIDINFO_HANDLE |
| #define f_create_options_handle F_CREATE_OPTIONS_HANDLE |
| #define f_create_ScalePerm_handle F_CREATE_SCALEPERM_HANDLE |
| #define f_create_LUstruct_handle F_CREATE_LUSTRUCT_HANDLE |
| #define f_create_SOLVEstruct_handle F_CREATE_SOLVESTRUCT_HANDLE |
| #define f_create_SuperMatrix_handle F_CREATE_SUPERMATRIX_HANDLE |
| #define f_destroy_gridinfo_handle F_DESTROY_GRIDINFO_HANDLE |
| #define f_destroy_options_handle F_DESTROY_OPTIONS_HANDLE |
| #define f_destroy_ScalePerm_handle F_DESTROY_SCALEPERM_HANDLE |
| #define f_destroy_LUstruct_handle F_DESTROY_LUSTRUCT_HANDLE |
| #define f_destroy_SOLVEstruct_handle F_DESTROY_SOLVESTRUCT_HANDLE |
| #define f_destroy_SuperMatrix_handle F_DESTROY_SUPERMATRIX_HANDLE |
| #define f_create_SuperLUStat_handle F_CREATE_SUPERLUSTAT_HANDLE |
| #define f_destroy_SuperLUStat_handle F_DESTROY_SUPERLUSTAT_HANDLE |
| #define f_get_gridinfo F_GET_GRIDINFO |
| #define f_get_SuperMatrix F_GET_SUPERMATRIX |
| #define f_set_SuperMatrix F_SET_SUPERMATRIX |
| #define f_get_CompRowLoc_Matrix F_GET_COMPROWLOC_MATRIX |
| #define f_set_CompRowLoc_Matrix F_SET_COMPROWLOC_MATRIX |
| #define f_get_superlu_options F_GET_SUPERLU_OPTIONS |
| #define f_set_superlu_options F_SET_SUPERLU_OPTIONS |
| #define f_set_default_options F_SET_DEFAULT_OPTIONS |
| #define f_superlu_gridinit F_SUPERLU_GRIDINIT |
| #define f_superlu_gridexit F_SUPERLU_GRIDEXIT |
| #define f_ScalePermstructInit F_SCALEPERMSTRUCTINIT |
| #define f_ScalePermstructFree F_SCALEPERMSTRUCTFREE |
| #define f_PStatInit F_PSTATINIT |
| #define f_PStatFree F_PSTATFREE |
| #define f_LUstructInit F_LUSTRUCTINIT |
| #define f_LUstructFree F_LUSTRUCTFREE |
| #define f_Destroy_LU F_DESTROY_LU |
| #define f_dCreate_CompRowLoc_Mat_dist F_DCREATE_COMPROWLOC_MAT_DIST |
| #define f_Destroy_CompRowLoc_Mat_dist F_DESTROY_COMPROWLOC_MAT_DIST |
| #define f_Destroy_SuperMat_Store_dist F_DESTROY_SUPERMAT_STORE_DIST |
| #define f_dSolveFinalize F_DSOLVEFINALIZE |
| #define f_pdgssvx F_PDGSSVX |
| #define f_dcreate_dist_matrix F_DCREATE_DIST_MATRIX |
| #define f_check_malloc F_CHECK_MALLOC |
| #endif |
| #if (F77_CALL_C == NOCHANGE) |
| /* |
| * These defines set up the naming scheme required to have a fortran 77 |
| * routine call a C routine |
| * for following Fortran to C interface: |
| * FORTRAN CALL C DECLARATION |
| * call dgemm(...) void dgemm(...) |
| */ |
| /* BLAS */ |
| #define sasum_ sasum |
| #define isamax_ isamax |
| #define scopy_ scopy |
| #define sscal_ sscal |
| #define sger_ sger |
| #define snrm2_ snrm2 |
| #define ssymv_ ssymv |
| #define sdot_ sdot |
| #define saxpy_ saxpy |
| #define ssyr2_ ssyr2 |
| #define srot_ srot |
| #define sgemv_ sgemv |
| #define strsv_ strsv |
| #define sgemm_ sgemm |
| #define strsm_ strsm |
| #define dasum_ dasum |
| #define idamax_ idamax |
| #define dcopy_ dcopy |
| #define dscal_ dscal |
| #define dger_ dger |
| #define dnrm2_ dnrm2 |
| #define dsymv_ dsymv |
| #define ddot_ ddot |
| #define daxpy_ daxpy |
| #define dsyr2_ dsyr2 |
| #define drot_ drot |
| #define dgemv_ dgemv |
| #define dtrsv_ dtrsv |
| #define dgemm_ dgemm |
| #define dtrsm_ dtrsm |
| #define scasum_ scasum |
| #define icamax_ icamax |
| #define ccopy_ ccopy |
| #define cscal_ cscal |
| #define scnrm2_ scnrm2 |
| #define caxpy_ caxpy |
| #define cgemv_ cgemv |
| #define ctrsv_ ctrsv |
| #define cgemm_ cgemm |
| #define ctrsm_ ctrsm |
| #define cgerc_ cgerc |
| #define chemv_ chemv |
| #define cher2_ cher2 |
| #define dzasum_ dzasum |
| #define izamax_ izamax |
| #define zcopy_ zcopy |
| #define zscal_ zscal |
| #define dznrm2_ dznrm2 |
| #define zaxpy_ zaxpy |
| #define zgemv_ zgemv |
| #define ztrsv_ ztrsv |
| #define zgemm_ zgemm |
| #define ztrsm_ ztrsm |
| #define zgerc_ zgerc |
| #define zhemv_ zhemv |
| #define zher2_ zher2 |
| #define zgeru_ zgeru |
| /* LAPACK */ |
| #define dlamch_ dlamch |
| #define slamch_ slamch |
| #define xerbla_ xerbla |
| #define lsame_ lsame |
| #define mc64id_ mc64id |
| #define mc64ad_ mc64ad |
| #define c_bridge_dgssv_ c_bridge_dgssv |
| #define c_fortran_slugrid_ c_fortran_slugrid |
| #define c_fortran_pdgssvx_ c_fortran_pdgssvx |
| #define c_fortran_pdgssvx_ABglobal_ c_fortran_pdgssvx_abglobal |
| #define c_fortran_pzgssvx_ c_fortran_pzgssvx |
| #define c_fortran_pzgssvx_ABglobal_ c_fortran_pzgssvx_abglobal |
| /* These are the functions defined in F90 wraper */ |
| #define f_create_gridinfo_handle f_create_gridinfo_handle |
| #define f_create_options_handle f_create_options_handle |
| #define f_create_ScalePerm_handle f_create_scaleperm_handle |
| #define f_create_LUstruct_handle f_create_lustruct_handle |
| #define f_create_SOLVEstruct_handle f_create_solvestruct_handle |
| #define f_create_SuperMatrix_handle f_create_supermatrix_handle |
| #define f_destroy_gridinfo_handle f_destroy_gridinfo_handle |
| #define f_destroy_options_handle f_destroy_options_handle |
| #define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle |
| #define f_destroy_LUstruct_handle f_destroy_lustruct_handle |
| #define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle |
| #define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle |
| #define f_create_SuperLUStat_handle f_create_superlustat_handle |
| #define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle |
| #define f_get_gridinfo f_get_gridinfo |
| #define f_get_SuperMatrix f_get_supermatrix |
| #define f_set_SuperMatrix f_set_supermatrix |
| #define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix |
| #define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix |
| #define f_get_superlu_options f_get_superlu_options |
| #define f_set_superlu_options f_set_superlu_options |
| #define f_set_default_options f_set_default_options |
| #define f_superlu_gridinit f_superlu_gridinit |
| #define f_superlu_gridexit f_superlu_gridexit |
| #define f_ScalePermstructInit f_scalepermstructinit |
| #define f_ScalePermstructFree f_scalepermstructfree |
| #define f_PStatInit f_pstatinit |
| #define f_PStatFree f_pstatfree |
| #define f_LUstructInit f_lustructinit |
| #define f_LUstructFree f_lustructfree |
| #define f_Destroy_LU f_destroy_lu |
| #define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist |
| #define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist |
| #define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist |
| #define f_dSolveFinalize f_dsolvefinalize |
| #define f_pdgssvx f_pdgssvx |
| #define f_dcreate_dist_matrix f_dcreate_dist_matrix |
| #define f_check_malloc f_check_malloc |
| #endif |
| #endif /* __SUPERLU_CNAMES */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/dmemory.c |
|---|
| New file |
| 0,0 → 1,152 |
| /*! @file |
| * \brief Memory utilities |
| * |
| * <pre> |
| * -- Distributed SuperLU routine (version 1.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * September 1, 1999 |
| * </pre> |
| */ |
| #include "superlu_ddefs.h" |
| /* Variables external to this file */ |
| extern LU_stack_t stack; |
| void *duser_malloc_dist(int_t bytes, int_t which_end) |
| { |
| void *buf; |
| if ( StackFull(bytes) ) return (NULL); |
| if ( which_end == HEAD ) { |
| buf = (char*) stack.array + stack.top1; |
| stack.top1 += bytes; |
| } else { |
| stack.top2 -= bytes; |
| buf = (char*) stack.array + stack.top2; |
| } |
| stack.used += bytes; |
| return buf; |
| } |
| void duser_free_dist(int_t bytes, int_t which_end) |
| { |
| if ( which_end == HEAD ) { |
| stack.top1 -= bytes; |
| } else { |
| stack.top2 += bytes; |
| } |
| stack.used -= bytes; |
| } |
| /*! \brief |
| * |
| * <pre> |
| * mem_usage consists of the following fields: |
| * - for_lu (float) |
| * The amount of space used in bytes for the L\U data structures. |
| * - total (float) |
| * The amount of space needed in bytes to perform factorization. |
| * - expansions (int) |
| * Number of memory expansions during the LU factorization. |
| * </pre> |
| */ |
| int_t dQuerySpace_dist(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, |
| mem_usage_t *mem_usage) |
| { |
| register int_t dword, gb, iword, k, maxsup, nb, nsupers; |
| int_t *index, *xsup; |
| int iam, mycol, myrow; |
| Glu_persist_t *Glu_persist = LUstruct->Glu_persist; |
| LocalLU_t *Llu = LUstruct->Llu; |
| iam = grid->iam; |
| myrow = MYROW( iam, grid ); |
| mycol = MYCOL( iam, grid ); |
| iword = sizeof(int_t); |
| dword = sizeof(double); |
| maxsup = sp_ienv_dist(3); |
| nsupers = Glu_persist->supno[n-1] + 1; |
| xsup = Glu_persist->xsup; |
| mem_usage->for_lu = 0; |
| /* For L factor */ |
| nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ |
| for (k = 0; k < nb; ++k) { |
| gb = k * grid->npcol + mycol; /* Global block number. */ |
| if ( gb < nsupers ) { |
| index = Llu->Lrowind_bc_ptr[k]; |
| if ( index ) { |
| mem_usage->for_lu += (float) |
| ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword); |
| mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword); |
| } |
| } |
| } |
| /* For U factor */ |
| nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */ |
| for (k = 0; k < nb; ++k) { |
| gb = k * grid->nprow + myrow; /* Global block number. */ |
| if ( gb < nsupers ) { |
| index = Llu->Ufstnz_br_ptr[k]; |
| if ( index ) { |
| mem_usage->for_lu += (float)(index[2] * iword); |
| mem_usage->for_lu += (float)(index[1] * dword); |
| } |
| } |
| } |
| /* Working storage to support factorization */ |
| mem_usage->total = mem_usage->for_lu; |
| mem_usage->total += |
| (float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword + |
| ( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword ); |
| /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/ |
| mem_usage->total += |
| (float)( Llu->bufmax[0] * iword + Llu->bufmax[1] * dword ); |
| mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword; |
| k = CEILING( nsupers, grid->nprow ); |
| mem_usage->total += (float)(2 * k * iword); |
| return 0; |
| } /* dQuerySpace_dist */ |
| /* |
| * Allocate storage for original matrix A |
| */ |
| void |
| dallocateA_dist(int_t n, int_t nnz, double **a, int_t **asub, int_t **xa) |
| { |
| *a = (double *) doubleMalloc_dist(nnz); |
| *asub = (int_t *) intMalloc_dist(nnz); |
| *xa = (int_t *) intMalloc_dist(n+1); |
| } |
| double *doubleMalloc_dist(int_t n) |
| { |
| double *buf; |
| buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double) ); |
| return (buf); |
| } |
| double *doubleCalloc_dist(int_t n) |
| { |
| double *buf; |
| register int_t i; |
| double zero = 0.0; |
| buf = (double *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(double)); |
| if ( !buf ) return (buf); |
| for (i = 0; i < n; ++i) buf[i] = zero; |
| return (buf); |
| } |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/xerbla.c |
|---|
| New file |
| 0,0 → 1,50 |
| /*! @file |
| * \brief |
| <pre> |
| -- LAPACK auxiliary routine (version 2.0) -- |
| Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., |
| Courant Institute, Argonne National Lab, and Rice University |
| September 30, 1994 |
| </pre> |
| */ |
| #include "Cnames.h" |
| /* Subroutine */ |
| /*! \brief |
| <pre> |
| Purpose |
| ======= |
| XERBLA is an error handler for the LAPACK routines. |
| It is called by an LAPACK routine if an input parameter has an |
| invalid value. A message is printed and execution stops. |
| Installers may consider modifying the STOP statement in order to |
| call system-specific exception-handling facilities. |
| Arguments |
| ========= |
| SRNAME (input) CHARACTER*6 |
| The name of the routine which called XERBLA. |
| INFO (input) INT |
| The position of the invalid parameter in the parameter list |
| of the calling routine. |
| ===================================================================== |
| </pre> |
| */ |
| int xerbla_(char *srname, int *info) |
| { |
| printf("** On entry to %6s, parameter number %2d had an illegal value\n", |
| srname, *info); |
| /* End of XERBLA */ |
| return 0; |
| } /* xerbla_ */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/util.c |
|---|
| New file |
| 0,0 → 1,786 |
| /*! @file |
| * \brief Utilities functions |
| * |
| * <pre> |
| * -- Distributed SuperLU routine (version 2.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * February 1, 2003 |
| * </pre> |
| */ |
| #include <math.h> |
| #include "superlu_ddefs.h" |
| /*! \brief Deallocate the structure pointing to the actual storage of the matrix. */ |
| void |
| Destroy_SuperMatrix_Store_dist(SuperMatrix *A) |
| { |
| SUPERLU_FREE ( A->Store ); |
| } |
| void |
| Destroy_CompCol_Matrix_dist(SuperMatrix *A) |
| { |
| NCformat *Astore = A->Store; |
| SUPERLU_FREE( Astore->rowind ); |
| SUPERLU_FREE( Astore->colptr ); |
| if ( Astore->nzval ) SUPERLU_FREE( Astore->nzval ); |
| SUPERLU_FREE( Astore ); |
| } |
| void |
| Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A) |
| { |
| NRformat_loc *Astore = A->Store; |
| SUPERLU_FREE( Astore->rowptr ); |
| SUPERLU_FREE( Astore->colind ); |
| SUPERLU_FREE( Astore->nzval ); |
| SUPERLU_FREE( Astore ); |
| } |
| void |
| Destroy_CompRow_Matrix_dist(SuperMatrix *A) |
| { |
| SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); |
| SUPERLU_FREE( ((NRformat *)A->Store)->colind ); |
| SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); |
| SUPERLU_FREE( A->Store ); |
| } |
| void |
| Destroy_SuperNode_Matrix_dist(SuperMatrix *A) |
| { |
| SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); |
| SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); |
| SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); |
| SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); |
| SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); |
| SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); |
| SUPERLU_FREE ( A->Store ); |
| } |
| /*! \brief A is of type Stype==NCP */ |
| void |
| Destroy_CompCol_Permuted_dist(SuperMatrix *A) |
| { |
| SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); |
| SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); |
| SUPERLU_FREE ( A->Store ); |
| } |
| /*! \brief A is of type Stype==DN */ |
| void |
| Destroy_Dense_Matrix_dist(SuperMatrix *A) |
| { |
| DNformat* Astore = A->Store; |
| SUPERLU_FREE (Astore->nzval); |
| SUPERLU_FREE ( A->Store ); |
| } |
| /*! \brief Destroy distributed L & U matrices. */ |
| void |
| Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) |
| { |
| int_t i, nb, nsupers; |
| Glu_persist_t *Glu_persist = LUstruct->Glu_persist; |
| LocalLU_t *Llu = LUstruct->Llu; |
| #if ( DEBUGlevel>=1 ) |
| int iam; |
| MPI_Comm_rank( MPI_COMM_WORLD, &iam ); |
| CHECK_MALLOC(iam, "Enter Destroy_LU()"); |
| #endif |
| nsupers = Glu_persist->supno[n-1] + 1; |
| nb = CEILING(nsupers, grid->npcol); |
| for (i = 0; i < nb; ++i) |
| if ( Llu->Lrowind_bc_ptr[i] ) { |
| SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); |
| SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); |
| } |
| SUPERLU_FREE (Llu->Lrowind_bc_ptr); |
| SUPERLU_FREE (Llu->Lnzval_bc_ptr); |
| nb = CEILING(nsupers, grid->nprow); |
| for (i = 0; i < nb; ++i) |
| if ( Llu->Ufstnz_br_ptr[i] ) { |
| SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); |
| SUPERLU_FREE (Llu->Unzval_br_ptr[i]); |
| } |
| SUPERLU_FREE (Llu->Ufstnz_br_ptr); |
| SUPERLU_FREE (Llu->Unzval_br_ptr); |
| /* The following can be freed after factorization. */ |
| SUPERLU_FREE(Llu->ToRecv); |
| SUPERLU_FREE(Llu->ToSendD); |
| SUPERLU_FREE(Llu->ToSendR[0]); |
| SUPERLU_FREE(Llu->ToSendR); |
| /* The following can be freed only after iterative refinement. */ |
| SUPERLU_FREE(Llu->ilsum); |
| SUPERLU_FREE(Llu->fmod); |
| SUPERLU_FREE(Llu->fsendx_plist[0]); |
| SUPERLU_FREE(Llu->fsendx_plist); |
| SUPERLU_FREE(Llu->bmod); |
| SUPERLU_FREE(Llu->bsendx_plist[0]); |
| SUPERLU_FREE(Llu->bsendx_plist); |
| SUPERLU_FREE(Llu->mod_bit); |
| SUPERLU_FREE(Glu_persist->xsup); |
| SUPERLU_FREE(Glu_persist->supno); |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit Destroy_LU()"); |
| #endif |
| } |
| /*! \brief Allocate storage in ScalePermstruct */ |
| void ScalePermstructInit(const int_t m, const int_t n, |
| ScalePermstruct_t *ScalePermstruct) |
| { |
| ScalePermstruct->DiagScale = NOEQUIL; |
| if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) ) |
| ABORT("Malloc fails for perm_r[]."); |
| if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) ) |
| ABORT("Malloc fails for perm_c[]."); |
| } |
| /*! \brief Deallocate ScalePermstruct */ |
| void ScalePermstructFree(ScalePermstruct_t *ScalePermstruct) |
| { |
| SUPERLU_FREE(ScalePermstruct->perm_r); |
| SUPERLU_FREE(ScalePermstruct->perm_c); |
| switch ( ScalePermstruct->DiagScale ) { |
| case ROW: |
| SUPERLU_FREE(ScalePermstruct->R); |
| break; |
| case COL: |
| SUPERLU_FREE(ScalePermstruct->C); |
| break; |
| case BOTH: |
| SUPERLU_FREE(ScalePermstruct->R); |
| SUPERLU_FREE(ScalePermstruct->C); |
| break; |
| } |
| } |
| /*! \brief Allocate storage in LUstruct */ |
| void LUstructInit(const int_t m, const int_t n, LUstruct_t *LUstruct) |
| { |
| if ( !(LUstruct->etree = intMalloc_dist(n)) ) |
| ABORT("Malloc fails for etree[]."); |
| if ( !(LUstruct->Glu_persist = (Glu_persist_t *) |
| SUPERLU_MALLOC(sizeof(Glu_persist_t))) ) |
| ABORT("Malloc fails for Glu_persist_t."); |
| if ( !(LUstruct->Llu = (LocalLU_t *) |
| SUPERLU_MALLOC(sizeof(LocalLU_t))) ) |
| ABORT("Malloc fails for LocalLU_t."); |
| } |
| /*! \brief Deallocate LUstruct */ |
| void LUstructFree(LUstruct_t *LUstruct) |
| { |
| #if ( DEBUGlevel>=1 ) |
| int iam; |
| MPI_Comm_rank( MPI_COMM_WORLD, &iam ); |
| CHECK_MALLOC(iam, "Enter LUstructFree()"); |
| #endif |
| SUPERLU_FREE(LUstruct->etree); |
| SUPERLU_FREE(LUstruct->Glu_persist); |
| SUPERLU_FREE(LUstruct->Llu); |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit LUstructFree()"); |
| #endif |
| } |
| /*! \brief |
| * |
| * <pre> |
| * Count the total number of nonzeros in factors L and U, and in the |
| * symmetrically reduced L. |
| * </pre> |
| */ |
| void |
| countnz_dist(const int_t n, int_t *xprune, int_t *nnzL, int_t *nnzU, |
| Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) |
| { |
| int_t fnz, fsupc, i, j, nsuper; |
| int_t nnzL0, jlen, irep; |
| int_t *supno, *xsup, *xlsub, *xusub, *usub; |
| supno = Glu_persist->supno; |
| xsup = Glu_persist->xsup; |
| xlsub = Glu_freeable->xlsub; |
| xusub = Glu_freeable->xusub; |
| usub = Glu_freeable->usub; |
| *nnzL = 0; |
| *nnzU = 0; |
| nnzL0 = 0; |
| nsuper = supno[n]; |
| if ( n <= 0 ) return; |
| /* |
| * For each supernode in L. |
| */ |
| for (i = 0; i <= nsuper; i++) { |
| fsupc = xsup[i]; |
| jlen = xlsub[fsupc+1] - xlsub[fsupc]; |
| for (j = fsupc; j < xsup[i+1]; j++) { |
| *nnzL += jlen; |
| *nnzU += j - fsupc + 1; |
| jlen--; |
| } |
| irep = xsup[i+1] - 1; |
| nnzL0 += xprune[irep] - xlsub[irep]; |
| } |
| /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/ |
| /* For each column in U. */ |
| for (j = 0; j < n; ++j) { |
| for (i = xusub[j]; i < xusub[j+1]; ++i) { |
| fnz = usub[i]; |
| fsupc = xsup[supno[fnz]+1]; |
| *nnzU += fsupc - fnz; |
| } |
| } |
| } |
| /*! \brief |
| * |
| * <pre> |
| * Fix up the data storage lsub for L-subscripts. It removes the subscript |
| * sets for structural pruning, and applies permuation to the remaining |
| * subscripts. |
| * </pre> |
| */ |
| int_t |
| fixupL_dist(const int_t n, const int_t *perm_r, |
| Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) |
| { |
| register int_t nsuper, fsupc, nextl, i, j, k, jstrt, lsub_size; |
| int_t *xsup, *lsub, *xlsub; |
| if ( n <= 1 ) return 0; |
| xsup = Glu_persist->xsup; |
| lsub = Glu_freeable->lsub; |
| xlsub = Glu_freeable->xlsub; |
| nextl = 0; |
| nsuper = (Glu_persist->supno)[n]; |
| lsub_size = xlsub[n]; |
| /* |
| * For each supernode ... |
| */ |
| for (i = 0; i <= nsuper; i++) { |
| fsupc = xsup[i]; |
| jstrt = xlsub[fsupc]; |
| xlsub[fsupc] = nextl; |
| for (j = jstrt; j < xlsub[fsupc+1]; j++) { |
| lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ |
| nextl++; |
| } |
| for (k = fsupc+1; k < xsup[i+1]; k++) |
| xlsub[k] = nextl; /* Other columns in supernode i */ |
| } |
| xlsub[n] = nextl; |
| return lsub_size; |
| } |
| /*! \brief Set the default values for the options argument. |
| */ |
| void set_default_options_dist(superlu_options_t *options) |
| { |
| options->Fact = DOFACT; |
| options->Equil = YES; |
| options->ParSymbFact = NO; |
| options->ColPerm = METIS_AT_PLUS_A; |
| options->RowPerm = LargeDiag; |
| options->ReplaceTinyPivot = YES; |
| options->IterRefine = DOUBLE; |
| options->Trans = NOTRANS; |
| options->SolveInitialized = NO; |
| options->RefineInitialized = NO; |
| options->PrintStat = YES; |
| } |
| /*! \brief Print the options setting. |
| */ |
| void print_options_dist(superlu_options_t *options) |
| { |
| printf(".. options:\n"); |
| printf("\tFact\t %8d\n", options->Fact); |
| printf("\tEquil\t %8d\n", options->Equil); |
| printf("\tColPerm\t %8d\n", options->ColPerm); |
| printf("\tRowPerm\t %8d\n", options->RowPerm); |
| printf("\tReplaceTinyPivot %4d\n", options->ReplaceTinyPivot); |
| printf("\tTrans\t %8d\n", options->Trans); |
| printf("\tIterRefine\t%4d\n", options->IterRefine); |
| printf("..\n"); |
| } |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * Set up the communication pattern for the triangular solution. |
| * |
| * Arguments |
| * ========= |
| * |
| * n (input) int (global) |
| * The dimension of the linear system. |
| * |
| * m_loc (input) int (local) |
| * The local row dimension of the distributed input matrix. |
| * |
| * nrhs (input) int (global) |
| * Number of right-hand sides. |
| * |
| * fst_row (input) int (global) |
| * The row number of matrix B's first row in the global matrix. |
| * |
| * perm_r (input) int* (global) |
| * The row permutation vector. |
| * |
| * perm_c (input) int* (global) |
| * The column permutation vector. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. |
| * </pre> |
| */ |
| int_t |
| pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, |
| int_t perm_r[], int_t perm_c[], gridinfo_t *grid, |
| Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct) |
| { |
| int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; |
| int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; |
| int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; |
| int_t *row_to_proc; |
| int_t i, gbi, k, l, num_diag_procs, *diag_procs; |
| int_t irow, lk, q, knsupc, nsupers, *xsup, *supno; |
| int iam, p, pkk, procs; |
| pxgstrs_comm_t *gstrs_comm; |
| procs = grid->nprow * grid->npcol; |
| iam = grid->iam; |
| gstrs_comm = SOLVEstruct->gstrs_comm; |
| xsup = Glu_persist->xsup; |
| supno = Glu_persist->supno; |
| nsupers = Glu_persist->supno[n-1] + 1; |
| row_to_proc = SOLVEstruct->row_to_proc; |
| /* ------------------------------------------------------------ |
| SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. |
| ------------------------------------------------------------*/ |
| if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) |
| ABORT("Malloc fails for B_to_X_itemp[]."); |
| SendCnt = itemp; |
| SendCnt_nrhs = itemp + procs; |
| RecvCnt = itemp + 2*procs; |
| RecvCnt_nrhs = itemp + 3*procs; |
| sdispls = itemp + 4*procs; |
| sdispls_nrhs = itemp + 5*procs; |
| rdispls = itemp + 6*procs; |
| rdispls_nrhs = itemp + 7*procs; |
| /* Count the number of elements to be sent to each diagonal process.*/ |
| for (p = 0; p < procs; ++p) SendCnt[p] = 0; |
| for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { |
| irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ |
| gbi = BlockNum( irow ); |
| p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ |
| ++SendCnt[p]; |
| } |
| /* Set up the displacements for alltoall. */ |
| MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); |
| sdispls[0] = rdispls[0] = 0; |
| for (p = 1; p < procs; ++p) { |
| sdispls[p] = sdispls[p-1] + SendCnt[p-1]; |
| rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; |
| } |
| for (p = 0; p < procs; ++p) { |
| SendCnt_nrhs[p] = SendCnt[p] * nrhs; |
| sdispls_nrhs[p] = sdispls[p] * nrhs; |
| RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; |
| rdispls_nrhs[p] = rdispls[p] * nrhs; |
| } |
| /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ |
| gstrs_comm->B_to_X_SendCnt = SendCnt; |
| /* ------------------------------------------------------------ |
| SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. |
| ------------------------------------------------------------*/ |
| /* This is freed in pxgstrs_finalize(). */ |
| if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) |
| ABORT("Malloc fails for X_to_B_itemp[]."); |
| SendCnt = itemp; |
| SendCnt_nrhs = itemp + procs; |
| RecvCnt = itemp + 2*procs; |
| RecvCnt_nrhs = itemp + 3*procs; |
| sdispls = itemp + 4*procs; |
| sdispls_nrhs = itemp + 5*procs; |
| rdispls = itemp + 6*procs; |
| rdispls_nrhs = itemp + 7*procs; |
| /* Count the number of X entries to be sent to each process.*/ |
| for (p = 0; p < procs; ++p) SendCnt[p] = 0; |
| num_diag_procs = SOLVEstruct->num_diag_procs; |
| diag_procs = SOLVEstruct->diag_procs; |
| for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ |
| pkk = diag_procs[p]; |
| if ( iam == pkk ) { |
| for (k = p; k < nsupers; k += num_diag_procs) { |
| knsupc = SuperSize( k ); |
| lk = LBi( k, grid ); /* Local block number */ |
| irow = FstBlockC( k ); |
| for (i = 0; i < knsupc; ++i) { |
| #if 0 |
| q = row_to_proc[inv_perm_c[irow]]; |
| #else |
| q = row_to_proc[irow]; |
| #endif |
| ++SendCnt[q]; |
| ++irow; |
| } |
| } |
| } |
| } |
| MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); |
| sdispls[0] = rdispls[0] = 0; |
| sdispls_nrhs[0] = rdispls_nrhs[0] = 0; |
| SendCnt_nrhs[0] = SendCnt[0] * nrhs; |
| RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; |
| for (p = 1; p < procs; ++p) { |
| sdispls[p] = sdispls[p-1] + SendCnt[p-1]; |
| rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; |
| sdispls_nrhs[p] = sdispls[p] * nrhs; |
| rdispls_nrhs[p] = rdispls[p] * nrhs; |
| SendCnt_nrhs[p] = SendCnt[p] * nrhs; |
| RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; |
| } |
| /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ |
| gstrs_comm->X_to_B_SendCnt = SendCnt; |
| if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) |
| ABORT("Malloc fails for ptr_to_ibuf[]."); |
| gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; |
| gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; |
| } /* PXGSTRS_INIT */ |
| void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm) |
| { |
| SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); |
| SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); |
| SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); |
| SUPERLU_FREE(gstrs_comm); |
| } |
| /*! \brief Diagnostic print of segment info after panel_dfs(). |
| */ |
| void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, |
| int_t *segrep, int_t *repfnz) |
| { |
| int_t j, k; |
| for (j = jcol; j < jcol+w; j++) { |
| printf("\tcol %d:\n", j); |
| for (k = 0; k < nseg; k++) |
| printf("\t\tseg %d, segrep %d, repfnz %d\n", k, |
| segrep[k], repfnz[(j-jcol)*n + segrep[k]]); |
| } |
| } |
| void |
| PStatInit(SuperLUStat_t *stat) |
| { |
| register int_t i; |
| if ( !(stat->utime = SUPERLU_MALLOC(NPHASES*sizeof(double))) ) |
| ABORT("Malloc fails for stat->utime[]"); |
| if ( !(stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t))) ) |
| ABORT("SUPERLU_MALLOC fails for stat->ops[]"); |
| for (i = 0; i < NPHASES; ++i) { |
| stat->utime[i] = 0.; |
| stat->ops[i] = 0.; |
| } |
| stat->TinyPivots = stat->RefineSteps = 0; |
| } |
| void |
| PStatPrint(superlu_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) |
| { |
| double *utime = stat->utime; |
| flops_t *ops = stat->ops; |
| int_t iam = grid->iam; |
| flops_t flopcnt, factflop, solveflop; |
| if ( options->PrintStat == NO ) return; |
| if ( !iam && options->Fact != FACTORED ) { |
| if ( options->Equil != NO ) |
| printf("\tEQUIL time %8.2f\n", utime[EQUIL]); |
| if ( options->RowPerm != NOROWPERM ) |
| printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); |
| if ( options->ColPerm != NATURAL ) |
| printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); |
| printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]); |
| printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); |
| } |
| MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM, |
| 0, grid->comm); |
| factflop = flopcnt; |
| if ( !iam && options->Fact != FACTORED ) { |
| printf("\tFACTOR time %8.2f\n", utime[FACT]); |
| if ( utime[FACT] != 0.0 ) |
| printf("\tFactor flops\t%e\tMflops \t%8.2f\n", |
| flopcnt, |
| flopcnt*1e-6/utime[FACT]); |
| } |
| MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, |
| 0, grid->comm); |
| solveflop = flopcnt; |
| if ( !iam ) { |
| printf("\tSOLVE time %8.2f\n", utime[SOLVE]); |
| if ( utime[SOLVE] != 0.0 ) |
| printf("\tSolve flops\t%e\tMflops \t%8.2f\n", |
| flopcnt, |
| flopcnt*1e-6/utime[SOLVE]); |
| } |
| if ( !iam && options->IterRefine != NOREFINE ) { |
| printf("\tREFINEMENT time %8.2f\tSteps%8d\n\n", |
| utime[REFINE], stat->RefineSteps); |
| } |
| #if ( PROFlevel>=1 ) |
| fflush(stdout); |
| MPI_Barrier( grid->comm ); |
| { |
| int_t i, P = grid->nprow*grid->npcol; |
| flops_t b, maxflop; |
| if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); |
| for (i = 0; i < P; ++i) { |
| if ( iam == i) { |
| printf("\t\t(%d)%8.2f%8.2f\n", iam, utime[COMM], utime[FACT]); |
| fflush(stdout); |
| } |
| MPI_Barrier( grid->comm ); |
| } |
| if ( !iam ) printf("\n.. FACT ops distribution:\n"); |
| for (i = 0; i < P; ++i) { |
| if ( iam == i ) { |
| printf("\t\t(%d)\t%e\n", iam, ops[FACT]); |
| fflush(stdout); |
| } |
| MPI_Barrier( grid->comm ); |
| } |
| MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); |
| if ( !iam ) { |
| b = factflop/P/maxflop; |
| printf("\tFACT load balance: %.2f\n", b); |
| } |
| if ( !iam ) printf("\n.. SOLVE ops distribution:\n"); |
| for (i = 0; i < P; ++i) { |
| if ( iam == i ) { |
| printf("\t\t%d\t%e\n", iam, ops[SOLVE]); |
| fflush(stdout); |
| } |
| MPI_Barrier( grid->comm ); |
| } |
| MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm); |
| if ( !iam ) { |
| b = solveflop/P/maxflop; |
| printf("\tSOLVE load balance: %.2f\n", b); |
| } |
| } |
| #endif |
| /* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */ |
| } |
| void |
| PStatFree(SuperLUStat_t *stat) |
| { |
| SUPERLU_FREE(stat->utime); |
| SUPERLU_FREE(stat->ops); |
| } |
| /*! \brief Fills an integer array with a given value. |
| */ |
| void ifill_dist(int_t *a, int_t alen, int_t ival) |
| { |
| register int_t i; |
| for (i = 0; i < alen; i++) a[i] = ival; |
| } |
| void |
| get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, |
| int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) |
| { |
| int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk; |
| int_t *xsup; |
| i = j = *num_diag_procs = pkk = 0; |
| nprow = grid->nprow; |
| npcol = grid->npcol; |
| nsupers = Glu_persist->supno[n-1] + 1; |
| xsup = Glu_persist->xsup; |
| do { |
| ++(*num_diag_procs); |
| i = (++i) % nprow; |
| j = (++j) % npcol; |
| pkk = PNUM( i, j, grid ); |
| } while ( pkk != 0 ); /* Until wrap back to process 0 */ |
| if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) ) |
| ABORT("Malloc fails for diag_procs[]"); |
| if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) ) |
| ABORT("Calloc fails for diag_len[]"); |
| for (i = j = k = 0; k < *num_diag_procs; ++k) { |
| pkk = PNUM( i, j, grid ); |
| (*diag_procs)[k] = pkk; |
| i = (++i) % nprow; |
| j = (++j) % npcol; |
| } |
| for (k = 0; k < nsupers; ++k) { |
| knsupc = SuperSize( k ); |
| i = k % *num_diag_procs; |
| (*diag_len)[i] += knsupc; |
| } |
| } |
| /*! \brief Get the statistics of the supernodes |
| */ |
| #define NBUCKS 10 |
| static int_t max_sup_size; |
| void super_stats_dist(int_t nsuper, int_t *xsup) |
| { |
| register int_t nsup1 = 0; |
| int_t i, isize, whichb, bl, bh; |
| int_t bucket[NBUCKS]; |
| max_sup_size = 0; |
| for (i = 0; i <= nsuper; i++) { |
| isize = xsup[i+1] - xsup[i]; |
| if ( isize == 1 ) nsup1++; |
| if ( max_sup_size < isize ) max_sup_size = isize; |
| } |
| printf(" Supernode statistics:\n\tno of super = %d\n", nsuper+1); |
| printf("\tmax supernode size = %d\n", max_sup_size); |
| printf("\tno of size 1 supernodes = %d\n", nsup1); |
| /* Histogram of the supernode sizes */ |
| ifill_dist (bucket, NBUCKS, 0); |
| for (i = 0; i <= nsuper; i++) { |
| isize = xsup[i+1] - xsup[i]; |
| whichb = (float) isize / max_sup_size * NBUCKS; |
| if (whichb >= NBUCKS) whichb = NBUCKS - 1; |
| bucket[whichb]++; |
| } |
| printf("\tHistogram of supernode sizes:\n"); |
| for (i = 0; i < NBUCKS; i++) { |
| bl = (float) i * max_sup_size / NBUCKS; |
| bh = (float) (i+1) * max_sup_size / NBUCKS; |
| printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]); |
| } |
| } |
| /*! \brief Check whether repfnz[] == EMPTY after reset. |
| */ |
| void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz) |
| { |
| int_t jj, k; |
| for (jj = jcol; jj < jcol+w; jj++) |
| for (k = 0; k < n; k++) |
| if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { |
| fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj, |
| k, repfnz[(jj-jcol)*n + k]); |
| ABORT("check_repfnz_dist"); |
| } |
| } |
| void PrintInt10(char *name, int_t len, int_t *x) |
| { |
| register int_t i; |
| printf("%10s:", name); |
| for (i = 0; i < len; ++i) { |
| if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i+9); |
| printf("%6d", x[i]); |
| } |
| printf("\n"); |
| } |
| int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x) |
| { |
| register int_t i; |
| fprintf(fp, "%10s:", name); |
| for (i = 0; i < len; ++i) { |
| if ( i % 10 == 0 ) fprintf(fp, "\n\t[%2d-%2d]", i, i+9); |
| fprintf(fp, "%6d", x[i]); |
| } |
| fprintf(fp, "\n"); |
| } |
| int_t |
| CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt) |
| { |
| register int_t i, j, zd, numzd = 0; |
| for (j = 0; j < n; ++j) { |
| zd = 0; |
| for (i = colbeg[j]; i < colbeg[j]+colcnt[j]; ++i) { |
| /*if ( iperm[rowind[i]] == j ) zd = 1;*/ |
| if ( rowind[i] == j ) { zd = 1; break; } |
| } |
| if ( zd == 0 ) { |
| #if ( PRNTlevel>=2 ) |
| printf(".. Diagonal of column %d is zero.\n", j); |
| #endif |
| ++numzd; |
| } |
| } |
| return numzd; |
| } |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/mc64ad.c |
|---|
| New file |
| 0,0 → 1,2641 |
| /* mc64ad.f -- translated by f2c (version 20100827). |
| You must link the resulting object file with libf2c: |
| on Microsoft Windows system, link with libf2c.lib; |
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm |
| or, if you install libf2c.a in a standard place, with -lf2c -lm |
| -- in that order, at the end of the command line, as in |
| cc *.o -lf2c -lm |
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., |
| http://www.netlib.org/f2c/libf2c.zip |
| */ |
| #include "superlu_ddefs.h" |
| #define abs(x) ((x) >= 0 ? (x) : -(x)) |
| #define min(a,b) ((a) < (b)) ? (a) : (b) |
| /* Table of constant values */ |
| static int_t c__1 = 1; |
| static int_t c__2 = 2; |
| /* CCCC COPYRIGHT (c) 1999 Council for the Central Laboratory of the */ |
| /* CCCC Research Councils. All rights reserved. */ |
| /* CCCC PACKAGE MC64A/AD */ |
| /* CCCC AUTHORS Iain Duff (i.duff@rl.ac.uk) and Jacko Koster (jak@ii.uib.no) */ |
| /* CCCC LAST UPDATE 20/09/99 */ |
| /* CCCC */ |
| /* *** Conditions on external use *** */ |
| /* The user shall acknowledge the contribution of this */ |
| /* package in any publication of material dependent upon the use of */ |
| /* the package. The user shall use reasonable endeavours to notify */ |
| /* the authors of the package of this publication. */ |
| /* The user can modify this code but, at no time */ |
| /* shall the right or title to all or any part of this package pass */ |
| /* to the user. The user shall make available free of charge */ |
| /* to the authors for any purpose all information relating to any */ |
| /* alteration or addition made to this package for the purposes of */ |
| /* extending the capabilities or enhancing the performance of this */ |
| /* package. */ |
| /* The user shall not pass this code directly to a third party without the */ |
| /* express prior consent of the authors. Users wanting to licence their */ |
| /* own copy of these routines should send email to hsl@aeat.co.uk */ |
| /* None of the comments from the Copyright notice up to and including this */ |
| /* one shall be removed or altered in any way. */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64id_(int_t *icntl) |
| { |
| int_t i__; |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* Purpose */ |
| /* ======= */ |
| /* The components of the array ICNTL control the action of MC64A/AD. */ |
| /* Default values for these are set in this subroutine. */ |
| /* Parameters */ |
| /* ========== */ |
| /* Local variables */ |
| /* ICNTL(1) has default value 6. */ |
| /* It is the output stream for error messages. If it */ |
| /* is negative, these messages will be suppressed. */ |
| /* ICNTL(2) has default value 6. */ |
| /* It is the output stream for warning messages. */ |
| /* If it is negative, these messages are suppressed. */ |
| /* ICNTL(3) has default value -1. */ |
| /* It is the output stream for monitoring printing. */ |
| /* If it is negative, these messages are suppressed. */ |
| /* ICNTL(4) has default value 0. */ |
| /* If left at the defaut value, the incoming data is checked for */ |
| /* out-of-range indices and duplicates. Setting ICNTL(4) to any */ |
| /* other will avoid the checks but is likely to cause problems */ |
| /* later if out-of-range indices or duplicates are present. */ |
| /* The user should only set ICNTL(4) non-zero, if the data is */ |
| /* known to avoid these problems. */ |
| /* ICNTL(5) to ICNTL(10) are not used by MC64A/AD but are set to */ |
| /* zero in this routine. */ |
| /* Initialization of the ICNTL array. */ |
| /* Parameter adjustments */ |
| --icntl; |
| /* Function Body */ |
| icntl[1] = 6; |
| icntl[2] = 6; |
| icntl[3] = -1; |
| for (i__ = 4; i__ <= 10; ++i__) { |
| icntl[i__] = 0; |
| /* L10: */ |
| } |
| return 0; |
| } /* mc64id_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64ad_(int_t *job, int_t *n, int_t *ne, int_t * |
| ip, int_t *irn, double *a, int_t *num, int_t *cperm, |
| int_t *liw, int_t *iw, int_t *ldw, double *dw, int_t * |
| icntl, int_t *info) |
| { |
| /* System generated locals */ |
| int_t i__1, i__2; |
| double d__1, d__2; |
| /* Builtin functions */ |
| double log(double); |
| /* Local variables */ |
| int_t i__, j, k; |
| double fact, rinf; |
| extern /* Subroutine */ int_t mc21ad_(int_t *, int_t *, int_t *, |
| int_t *, int_t *, int_t *, int_t *, int_t *), mc64bd_( |
| int_t *, int_t *, int_t *, int_t *, double *, int_t |
| *, int_t *, int_t *, int_t *, int_t *, int_t *, |
| double *), mc64rd_(int_t *, int_t *, int_t *, int_t *, |
| double *), mc64sd_(int_t *, int_t *, int_t *, int_t * |
| , double *, int_t *, int_t *, int_t *, int_t *, |
| int_t *, int_t *, int_t *, int_t *, int_t *), mc64wd_( |
| int_t *, int_t *, int_t *, int_t *, double *, int_t |
| *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t |
| *, double *, double *); |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* Purpose */ |
| /* ======= */ |
| /* This subroutine attempts to find a column permutation for an NxN */ |
| /* sparse matrix A = {a_ij} that makes the permuted matrix have N */ |
| /* entries on its diagonal. */ |
| /* If the matrix is structurally nonsingular, the subroutine optionally */ |
| /* returns a column permutation that maximizes the smallest element */ |
| /* on the diagonal, maximizes the sum of the diagonal entries, or */ |
| /* maximizes the product of the diagonal entries of the permuted matrix. */ |
| /* For the latter option, the subroutine also finds scaling factors */ |
| /* that may be used to scale the matrix so that the nonzero diagonal */ |
| /* entries of the permuted matrix are one in absolute value and all the */ |
| /* off-diagonal entries are less than or equal to one in absolute value. */ |
| /* The natural logarithms of the scaling factors u(i), i=1..N, for the */ |
| /* rows and v(j), j=1..N, for the columns are returned so that the */ |
| /* scaled matrix B = {b_ij} has entries b_ij = a_ij * EXP(u_i + v_j). */ |
| /* Parameters */ |
| /* ========== */ |
| /* JOB is an INT_T variable which must be set by the user to */ |
| /* control the action. It is not altered by the subroutine. */ |
| /* Possible values for JOB are: */ |
| /* 1 Compute a column permutation of the matrix so that the */ |
| /* permuted matrix has as many entries on its diagonal as possible. */ |
| /* The values on the diagonal are of arbitrary size. HSL subroutine */ |
| /* MC21A/AD is used for this. See [1]. */ |
| /* 2 Compute a column permutation of the matrix so that the smallest */ |
| /* value on the diagonal of the permuted matrix is maximized. */ |
| /* See [3]. */ |
| /* 3 Compute a column permutation of the matrix so that the smallest */ |
| /* value on the diagonal of the permuted matrix is maximized. */ |
| /* The algorithm differs from the one used for JOB = 2 and may */ |
| /* have quite a different performance. See [2]. */ |
| /* 4 Compute a column permutation of the matrix so that the sum */ |
| /* of the diagonal entries of the permuted matrix is maximized. */ |
| /* See [3]. */ |
| /* 5 Compute a column permutation of the matrix so that the product */ |
| /* of the diagonal entries of the permuted matrix is maximized */ |
| /* and vectors to scale the matrix so that the nonzero diagonal */ |
| /* entries of the permuted matrix are one in absolute value and */ |
| /* all the off-diagonal entries are less than or equal to one in */ |
| /* absolute value. See [3]. */ |
| /* Restriction: 1 <= JOB <= 5. */ |
| /* N is an INT_T variable which must be set by the user to the */ |
| /* order of the matrix A. It is not altered by the subroutine. */ |
| /* Restriction: N >= 1. */ |
| /* NE is an INT_T variable which must be set by the user to the */ |
| /* number of entries in the matrix. It is not altered by the */ |
| /* subroutine. */ |
| /* Restriction: NE >= 1. */ |
| /* IP is an INT_T array of length N+1. */ |
| /* IP(J), J=1..N, must be set by the user to the position in array IRN */ |
| /* of the first row index of an entry in column J. IP(N+1) must be set */ |
| /* to NE+1. It is not altered by the subroutine. */ |
| /* IRN is an INT_T array of length NE. */ |
| /* IRN(K), K=1..NE, must be set by the user to hold the row indices of */ |
| /* the entries of the matrix. Those belonging to column J must be */ |
| /* stored contiguously in the positions IP(J)..IP(J+1)-1. The ordering */ |
| /* of the row indices within each column is unimportant. Repeated */ |
| /* entries are not allowed. The array IRN is not altered by the */ |
| /* subroutine. */ |
| /* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ |
| /* The user must set A(K), K=1..NE, to the numerical value of the */ |
| /* entry that corresponds to IRN(K). */ |
| /* It is not used by the subroutine when JOB = 1. */ |
| /* It is not altered by the subroutine. */ |
| /* NUM is an INT_T variable that need not be set by the user. */ |
| /* On successful exit, NUM will be the number of entries on the */ |
| /* diagonal of the permuted matrix. */ |
| /* If NUM < N, the matrix is structurally singular. */ |
| /* CPERM is an INT_T array of length N that need not be set by the */ |
| /* user. On successful exit, CPERM contains the column permutation. */ |
| /* Column CPERM(J) of the original matrix is column J in the permuted */ |
| /* matrix, J=1..N. */ |
| /* LIW is an INT_T variable that must be set by the user to */ |
| /* the dimension of array IW. It is not altered by the subroutine. */ |
| /* Restriction: */ |
| /* JOB = 1 : LIW >= 5N */ |
| /* JOB = 2 : LIW >= 4N */ |
| /* JOB = 3 : LIW >= 10N + NE */ |
| /* JOB = 4 : LIW >= 5N */ |
| /* JOB = 5 : LIW >= 5N */ |
| /* IW is an INT_T array of length LIW that is used for workspace. */ |
| /* LDW is an INT_T variable that must be set by the user to the */ |
| /* dimension of array DW. It is not altered by the subroutine. */ |
| /* Restriction: */ |
| /* JOB = 1 : LDW is not used */ |
| /* JOB = 2 : LDW >= N */ |
| /* JOB = 3 : LDW >= NE */ |
| /* JOB = 4 : LDW >= 2N + NE */ |
| /* JOB = 5 : LDW >= 3N + NE */ |
| /* DW is a REAL (DOUBLE PRECISION in the D-version) array of length LDW */ |
| /* that is used for workspace. If JOB = 5, on return, */ |
| /* DW(i) contains u_i, i=1..N, and DW(N+j) contains v_j, j=1..N. */ |
| /* ICNTL is an INT_T array of length 10. Its components control the */ |
| /* output of MC64A/AD and must be set by the user before calling */ |
| /* MC64A/AD. They are not altered by the subroutine. */ |
| /* ICNTL(1) must be set to specify the output stream for */ |
| /* error messages. If ICNTL(1) < 0, messages are suppressed. */ |
| /* The default value set by MC46I/ID is 6. */ |
| /* ICNTL(2) must be set by the user to specify the output stream for */ |
| /* warning messages. If ICNTL(2) < 0, messages are suppressed. */ |
| /* The default value set by MC46I/ID is 6. */ |
| /* ICNTL(3) must be set by the user to specify the output stream for */ |
| /* diagnostic messages. If ICNTL(3) < 0, messages are suppressed. */ |
| /* The default value set by MC46I/ID is -1. */ |
| /* ICNTL(4) must be set by the user to a value other than 0 to avoid */ |
| /* checking of the input data. */ |
| /* The default value set by MC46I/ID is 0. */ |
| /* INFO is an INT_T array of length 10 which need not be set by the */ |
| /* user. INFO(1) is set non-negative to indicate success. A negative */ |
| /* value is returned if an error occurred, a positive value if a */ |
| /* warning occurred. INFO(2) holds further information on the error. */ |
| /* On exit from the subroutine, INFO(1) will take one of the */ |
| /* following values: */ |
| /* 0 : successful entry (for structurally nonsingular matrix). */ |
| /* +1 : successful entry (for structurally singular matrix). */ |
| /* +2 : the returned scaling factors are large and may cause */ |
| /* overflow when used to scale the matrix. */ |
| /* (For JOB = 5 entry only.) */ |
| /* -1 : JOB < 1 or JOB > 5. Value of JOB held in INFO(2). */ |
| /* -2 : N < 1. Value of N held in INFO(2). */ |
| /* -3 : NE < 1. Value of NE held in INFO(2). */ |
| /* -4 : the defined length LIW violates the restriction on LIW. */ |
| /* Value of LIW required given by INFO(2). */ |
| /* -5 : the defined length LDW violates the restriction on LDW. */ |
| /* Value of LDW required given by INFO(2). */ |
| /* -6 : entries are found whose row indices are out of range. INFO(2) */ |
| /* contains the index of a column in which such an entry is found. */ |
| /* -7 : repeated entries are found. INFO(2) contains the index of a */ |
| /* column in which such entries are found. */ |
| /* INFO(3) to INFO(10) are not currently used and are set to zero by */ |
| /* the routine. */ |
| /* References: */ |
| /* [1] I. S. Duff, (1981), */ |
| /* "Algorithm 575. Permutations for a zero-free diagonal", */ |
| /* ACM Trans. Math. Software 7(3), 387-390. */ |
| /* [2] I. S. Duff and J. Koster, (1998), */ |
| /* "The design and use of algorithms for permuting large */ |
| /* entries to the diagonal of sparse matrices", */ |
| /* SIAM J. Matrix Anal. Appl., vol. 20, no. 4, pp. 889-901. */ |
| /* [3] I. S. Duff and J. Koster, (1999), */ |
| /* "On algorithms for permuting large entries to the diagonal */ |
| /* of sparse matrices", */ |
| /* Technical Report RAL-TR-1999-030, RAL, Oxfordshire, England. */ |
| /* Local variables and parameters */ |
| /* External routines and functions */ |
| /* EXTERNAL FD05AD */ |
| /* DOUBLE PRECISION FD05AD */ |
| /* Intrinsic functions */ |
| /* Set RINF to largest positive real number (infinity) */ |
| /* XSL RINF = FD05AD(5) */ |
| /* Parameter adjustments */ |
| --cperm; |
| --ip; |
| --a; |
| --irn; |
| --iw; |
| --dw; |
| --icntl; |
| --info; |
| /* Function Body */ |
| rinf = dlamch_("Overflow"); |
| /* Check value of JOB */ |
| if (*job < 1 || *job > 5) { |
| info[1] = -1; |
| info[2] = *job; |
| if (icntl[1] >= 0) { |
| printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], |
| " because JOB = %d\n", *job); |
| } |
| goto L99; |
| } |
| /* Check value of N */ |
| if (*n < 1) { |
| info[1] = -2; |
| info[2] = *n; |
| if (icntl[1] >= 0) { |
| printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], |
| " because N = %d\n", *job); |
| } |
| goto L99; |
| } |
| /* Check value of NE */ |
| if (*ne < 1) { |
| info[1] = -3; |
| info[2] = *ne; |
| if (icntl[1] >= 0) { |
| printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], |
| " because NE = %d\n", *job); |
| } |
| goto L99; |
| } |
| /* Check LIW */ |
| if (*job == 1) { |
| k = *n * 5; |
| } |
| if (*job == 2) { |
| k = *n << 2; |
| } |
| if (*job == 3) { |
| k = *n * 10 + *ne; |
| } |
| if (*job == 4) { |
| k = *n * 5; |
| } |
| if (*job == 5) { |
| k = *n * 5; |
| } |
| if (*liw < k) { |
| info[1] = -4; |
| info[2] = k; |
| if (icntl[1] >= 0) { |
| printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], |
| " LIW too small, must be at least %8d\n", k); |
| } |
| goto L99; |
| } |
| /* Check LDW */ |
| /* If JOB = 1, do not check */ |
| if (*job > 1) { |
| if (*job == 2) { |
| k = *n; |
| } |
| if (*job == 3) { |
| k = *ne; |
| } |
| if (*job == 4) { |
| k = (*n << 1) + *ne; |
| } |
| if (*job == 5) { |
| k = *n * 3 + *ne; |
| } |
| if (*ldw < k) { |
| info[1] = -5; |
| info[2] = k; |
| if (icntl[1] >= 0) { |
| printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], |
| " LDW too small, must be at least %8d\n", k); |
| } |
| goto L99; |
| } |
| } |
| if (icntl[4] == 0) { |
| /* Check row indices. Use IW(1:N) as workspace */ |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| iw[i__] = 0; |
| /* L3: */ |
| } |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| i__ = irn[k]; |
| /* Check for row indices that are out of range */ |
| if (i__ < 1 || i__ > *n) { |
| info[1] = -6; |
| info[2] = j; |
| if (icntl[1] >= 0) { |
| printf(" ****** Error in MC64A/AD. INFO(1) = %2d", |
| info[1], " Column %8d", j, |
| " contains an entry with invalid row index %8d\n", i__); |
| } |
| goto L99; |
| } |
| /* Check for repeated row indices within a column */ |
| if (iw[i__] == j) { |
| info[1] = -7; |
| info[2] = j; |
| if (icntl[1] >= 0) { |
| printf(" ****** Error in MC64A/AD. INFO(1) = %2d", info[1], |
| " Column %8d", j, |
| " contains two or more entries with row index %8d\n", i__); |
| } |
| goto L99; |
| } else { |
| iw[i__] = j; |
| } |
| /* L4: */ |
| } |
| /* L6: */ |
| } |
| } |
| /* Print diagnostics on input */ |
| if (icntl[3] >= 0) { |
| printf(" ****** Input parameters for MC64A/AD: JOB = %8d," |
| " N = %d, NE = %8d\n", *job, *n, *ne); |
| printf(" IP(1:N+1) = "); |
| for (j=1; j<=(*n+1); ++j) { |
| printf("%8d", ip[j]); |
| if (j%8 == 0) printf("\n"); |
| } |
| printf("\n IRN(1:NE) = "); |
| for (j=1; j<=(*ne); ++j) { |
| printf("%8d", irn[j]); |
| if (j%8 == 0) printf("\n"); |
| } |
| printf("\n"); |
| if (*job > 1) { |
| printf(" A(1:NE) = "); |
| for (j=1; j<=(*ne); ++j) { |
| printf("%f14.4", a[j]); |
| if (j%4 == 0) printf("\n"); |
| } |
| printf("\n"); |
| } |
| } |
| /* Set components of INFO to zero */ |
| for (i__ = 1; i__ <= 10; ++i__) { |
| info[i__] = 0; |
| /* L8: */ |
| } |
| /* Compute maximum matching with MC21A/AD */ |
| if (*job == 1) { |
| /* Put length of column J in IW(J) */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| iw[j] = ip[j + 1] - ip[j]; |
| /* L10: */ |
| } |
| /* IW(N+1:5N) is workspace */ |
| #if 0 |
| mc21ad_(n, &irn[1], ne, &ip[1], &iw[1], &cperm[1], num, &iw[*n+1]); |
| #else |
| printf(" ****** Warning from MC64A/AD. Need to link mc21ad.\n"); |
| #endif |
| goto L90; |
| } |
| /* Compute bottleneck matching */ |
| if (*job == 2) { |
| /* IW(1:5N), DW(1:N) are workspaces */ |
| mc64bd_(n, ne, &ip[1], &irn[1], &a[1], &cperm[1], num, &iw[1], &iw[*n |
| + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &dw[1]); |
| goto L90; |
| } |
| /* Compute bottleneck matching */ |
| if (*job == 3) { |
| /* Copy IRN(K) into IW(K), ABS(A(K)) into DW(K), K=1..NE */ |
| i__1 = *ne; |
| for (k = 1; k <= i__1; ++k) { |
| iw[k] = irn[k]; |
| dw[k] = (d__1 = a[k], abs(d__1)); |
| /* L20: */ |
| } |
| /* Sort entries in each column by decreasing value. */ |
| mc64rd_(n, ne, &ip[1], &iw[1], &dw[1]); |
| /* IW(NE+1:NE+10N) is workspace */ |
| mc64sd_(n, ne, &ip[1], &iw[1], &dw[1], &cperm[1], num, &iw[*ne + 1], & |
| iw[*ne + *n + 1], &iw[*ne + (*n << 1) + 1], &iw[*ne + *n * 3 |
| + 1], &iw[*ne + (*n << 2) + 1], &iw[*ne + *n * 5 + 1], &iw[* |
| ne + *n * 6 + 1]); |
| goto L90; |
| } |
| if (*job == 4) { |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| fact = 0.; |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| if ((d__1 = a[k], abs(d__1)) > fact) { |
| fact = (d__2 = a[k], abs(d__2)); |
| } |
| /* L30: */ |
| } |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| dw[(*n << 1) + k] = fact - (d__1 = a[k], abs(d__1)); |
| /* L40: */ |
| } |
| /* L50: */ |
| } |
| /* B = DW(2N+1:2N+NE); IW(1:5N) and DW(1:2N) are workspaces */ |
| mc64wd_(n, ne, &ip[1], &irn[1], &dw[(*n << 1) + 1], &cperm[1], num, & |
| iw[1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &iw[( |
| *n << 2) + 1], &dw[1], &dw[*n + 1]); |
| goto L90; |
| } |
| if (*job == 5) { |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| fact = 0.; |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| dw[*n * 3 + k] = (d__1 = a[k], abs(d__1)); |
| if (dw[*n * 3 + k] > fact) { |
| fact = dw[*n * 3 + k]; |
| } |
| /* L60: */ |
| } |
| dw[(*n << 1) + j] = fact; |
| if (fact != 0.) { |
| fact = log(fact); |
| } else { |
| fact = rinf / *n; |
| } |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| if (dw[*n * 3 + k] != 0.) { |
| dw[*n * 3 + k] = fact - log(dw[*n * 3 + k]); |
| } else { |
| dw[*n * 3 + k] = rinf / *n; |
| } |
| /* L70: */ |
| } |
| /* L75: */ |
| } |
| /* B = DW(3N+1:3N+NE); IW(1:5N) and DW(1:2N) are workspaces */ |
| mc64wd_(n, ne, &ip[1], &irn[1], &dw[*n * 3 + 1], &cperm[1], num, &iw[ |
| 1], &iw[*n + 1], &iw[(*n << 1) + 1], &iw[*n * 3 + 1], &iw[(*n |
| << 2) + 1], &dw[1], &dw[*n + 1]); |
| if (*num == *n) { |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| if (dw[(*n << 1) + j] != 0.) { |
| dw[*n + j] -= log(dw[(*n << 1) + j]); |
| } else { |
| dw[*n + j] = 0.; |
| } |
| /* L80: */ |
| } |
| } |
| /* Check size of scaling factors */ |
| fact = log(rinf) * .5f; |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| if (dw[j] < fact && dw[*n + j] < fact) { |
| goto L86; |
| } |
| info[1] = 2; |
| goto L90; |
| L86: |
| ; |
| } |
| /* GO TO 90 */ |
| } |
| L90: |
| if (info[1] == 0 && *num < *n) { |
| /* Matrix is structurally singular, return with warning */ |
| info[1] = 1; |
| if (icntl[2] >= 0) { |
| printf(" ****** Warning from MC64A/AD. INFO(1) = %2d", info[1], |
| " The matrix is structurally singular.\n"); |
| } |
| } |
| if (info[1] == 2) { |
| /* Scaling factors are large, return with warning */ |
| if (icntl[2] >= 0) { |
| printf(" ****** Warning from MC64A/AD. INFO(1) = %2d\n", info[1], |
| " Some scaling factors may be too large.\n"); |
| } |
| } |
| /* Print diagnostics on output */ |
| if (icntl[3] >= 0) { |
| printf(" ****** Output parameters for MC64A/AD: INFO(1:2) = %8d%8d\n", |
| info[1], info[2]); |
| printf(" NUM = ", *num); |
| printf(" CPERM(1:N) = "); |
| for (j=1; j<=*n; ++j) { |
| printf("%8d", cperm[j]); |
| if (j%8 == 0) printf("\n"); |
| } |
| if (*job == 5) { |
| printf("\n DW(1:N) = "); |
| for (j=1; j<=*n; ++j) { |
| printf("%11.3f", dw[j]); |
| if (j%5 == 0) printf("\n"); |
| } |
| printf("\n DW(N+1:2N) = "); |
| for (j=1; j<=*n; ++j) { |
| printf("%11.3f", dw[*n+j]); |
| if (j%5 == 0) printf("\n"); |
| } |
| printf("\n"); |
| } |
| } |
| /* Return from subroutine. */ |
| L99: |
| return 0; |
| } /* mc64ad_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64bd_(int_t *n, int_t *ne, int_t *ip, int_t * |
| irn, double *a, int_t *iperm, int_t *num, int_t *jperm, |
| int_t *pr, int_t *q, int_t *l, double *d__) |
| { |
| /* System generated locals */ |
| int_t i__1, i__2, i__3; |
| double d__1, d__2, d__3; |
| /* Local variables */ |
| int_t i__, j, k; |
| double a0; |
| int_t i0, q0; |
| double ai, di; |
| int_t ii, jj, kk; |
| double bv; |
| int_t up; |
| double dq0; |
| int_t kk1, kk2; |
| double csp; |
| int_t isp, jsp, low; |
| double dnew; |
| int_t jord, qlen, idum, jdum; |
| double rinf; |
| extern /* Subroutine */ int_t mc64dd_(int_t *, int_t *, int_t *, |
| double *, int_t *, int_t *), mc64ed_(int_t *, int_t *, |
| int_t *, double *, int_t *, int_t *), mc64fd_(int_t * |
| , int_t *, int_t *, int_t *, double *, int_t *, int_t *); |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* N, NE, IP, IRN are described in MC64A/AD. */ |
| /* A is a REAL (DOUBLE PRECISION in the D-version) array of length */ |
| /* NE. A(K), K=1..NE, must be set to the value of the entry */ |
| /* that corresponds to IRN(K). It is not altered. */ |
| /* IPERM is an INT_T array of length N. On exit, it contains the */ |
| /* matching: IPERM(I) = 0 or row I is matched to column IPERM(I). */ |
| /* NUM is INT_T variable. On exit, it contains the cardinality of the */ |
| /* matching stored in IPERM. */ |
| /* IW is an INT_T work array of length 4N. */ |
| /* DW is a REAL (DOUBLE PRECISION in D-version) work array of length N. */ |
| /* Local variables */ |
| /* Local parameters */ |
| /* Intrinsic functions */ |
| /* External subroutines and/or functions */ |
| /* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD, DLAMCH */ |
| /* DOUBLE PRECISION FD05AD, DLAMCH */ |
| /* Set RINF to largest positive real number */ |
| /* XSL RINF = FD05AD(5) */ |
| /* Parameter adjustments */ |
| --d__; |
| --l; |
| --q; |
| --pr; |
| --jperm; |
| --iperm; |
| --ip; |
| --a; |
| --irn; |
| /* Function Body */ |
| rinf = dlamch_("Overflow"); |
| /* Initialization */ |
| *num = 0; |
| bv = rinf; |
| i__1 = *n; |
| for (k = 1; k <= i__1; ++k) { |
| iperm[k] = 0; |
| jperm[k] = 0; |
| pr[k] = ip[k]; |
| d__[k] = 0.; |
| /* L10: */ |
| } |
| /* Scan columns of matrix; */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| a0 = -1.; |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| i__ = irn[k]; |
| ai = (d__1 = a[k], abs(d__1)); |
| if (ai > d__[i__]) { |
| d__[i__] = ai; |
| } |
| if (jperm[j] != 0) { |
| goto L30; |
| } |
| if (ai >= bv) { |
| a0 = bv; |
| if (iperm[i__] != 0) { |
| goto L30; |
| } |
| jperm[j] = i__; |
| iperm[i__] = j; |
| ++(*num); |
| } else { |
| if (ai <= a0) { |
| goto L30; |
| } |
| a0 = ai; |
| i0 = i__; |
| } |
| L30: |
| ; |
| } |
| if (a0 != -1. && a0 < bv) { |
| bv = a0; |
| if (iperm[i0] != 0) { |
| goto L20; |
| } |
| iperm[i0] = j; |
| jperm[j] = i0; |
| ++(*num); |
| } |
| L20: |
| ; |
| } |
| /* Update BV with smallest of all the largest maximum absolute values */ |
| /* of the rows. */ |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| /* Computing MIN */ |
| d__1 = bv, d__2 = d__[i__]; |
| bv = min(d__1,d__2); |
| /* L25: */ |
| } |
| if (*num == *n) { |
| goto L1000; |
| } |
| /* Rescan unassigned columns; improve initial assignment */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| if (jperm[j] != 0) { |
| goto L95; |
| } |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| i__ = irn[k]; |
| ai = (d__1 = a[k], abs(d__1)); |
| if (ai < bv) { |
| goto L50; |
| } |
| if (iperm[i__] == 0) { |
| goto L90; |
| } |
| jj = iperm[i__]; |
| kk1 = pr[jj]; |
| kk2 = ip[jj + 1] - 1; |
| if (kk1 > kk2) { |
| goto L50; |
| } |
| i__3 = kk2; |
| for (kk = kk1; kk <= i__3; ++kk) { |
| ii = irn[kk]; |
| if (iperm[ii] != 0) { |
| goto L70; |
| } |
| if ((d__1 = a[kk], abs(d__1)) >= bv) { |
| goto L80; |
| } |
| L70: |
| ; |
| } |
| pr[jj] = kk2 + 1; |
| L50: |
| ; |
| } |
| goto L95; |
| L80: |
| jperm[jj] = ii; |
| iperm[ii] = jj; |
| pr[jj] = kk + 1; |
| L90: |
| ++(*num); |
| jperm[j] = i__; |
| iperm[i__] = j; |
| pr[j] = k + 1; |
| L95: |
| ; |
| } |
| if (*num == *n) { |
| goto L1000; |
| } |
| /* Prepare for main loop */ |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| d__[i__] = -1.; |
| l[i__] = 0; |
| /* L99: */ |
| } |
| /* Main loop ... each pass round this loop is similar to Dijkstra's */ |
| /* algorithm for solving the single source shortest path problem */ |
| i__1 = *n; |
| for (jord = 1; jord <= i__1; ++jord) { |
| if (jperm[jord] != 0) { |
| goto L100; |
| } |
| qlen = 0; |
| low = *n + 1; |
| up = *n + 1; |
| /* CSP is cost of shortest path to any unassigned row */ |
| /* ISP is matrix position of unassigned row element in shortest path */ |
| /* JSP is column index of unassigned row element in shortest path */ |
| csp = -1.; |
| /* Build shortest path tree starting from unassigned column JORD */ |
| j = jord; |
| pr[j] = -1; |
| /* Scan column J */ |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| i__ = irn[k]; |
| dnew = (d__1 = a[k], abs(d__1)); |
| if (csp >= dnew) { |
| goto L115; |
| } |
| if (iperm[i__] == 0) { |
| /* Row I is unassigned; update shortest path info */ |
| csp = dnew; |
| isp = i__; |
| jsp = j; |
| if (csp >= bv) { |
| goto L160; |
| } |
| } else { |
| d__[i__] = dnew; |
| if (dnew >= bv) { |
| /* Add row I to Q2 */ |
| --low; |
| q[low] = i__; |
| } else { |
| /* Add row I to Q, and push it */ |
| ++qlen; |
| l[i__] = qlen; |
| mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__1); |
| } |
| jj = iperm[i__]; |
| pr[jj] = j; |
| } |
| L115: |
| ; |
| } |
| i__2 = *num; |
| for (jdum = 1; jdum <= i__2; ++jdum) { |
| /* If Q2 is empty, extract new rows from Q */ |
| if (low == up) { |
| if (qlen == 0) { |
| goto L160; |
| } |
| i__ = q[1]; |
| if (csp >= d__[i__]) { |
| goto L160; |
| } |
| bv = d__[i__]; |
| i__3 = *n; |
| for (idum = 1; idum <= i__3; ++idum) { |
| mc64ed_(&qlen, n, &q[1], &d__[1], &l[1], &c__1); |
| l[i__] = 0; |
| --low; |
| q[low] = i__; |
| if (qlen == 0) { |
| goto L153; |
| } |
| i__ = q[1]; |
| if (d__[i__] != bv) { |
| goto L153; |
| } |
| /* L152: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| } |
| /* Move row Q0 */ |
| L153: |
| --up; |
| q0 = q[up]; |
| dq0 = d__[q0]; |
| l[q0] = up; |
| /* Scan column that matches with row Q0 */ |
| j = iperm[q0]; |
| i__3 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__3; ++k) { |
| i__ = irn[k]; |
| /* Update D(I) */ |
| if (l[i__] >= up) { |
| goto L155; |
| } |
| /* Computing MIN */ |
| d__2 = dq0, d__3 = (d__1 = a[k], abs(d__1)); |
| dnew = min(d__2,d__3); |
| if (csp >= dnew) { |
| goto L155; |
| } |
| if (iperm[i__] == 0) { |
| /* Row I is unassigned; update shortest path info */ |
| csp = dnew; |
| isp = i__; |
| jsp = j; |
| if (csp >= bv) { |
| goto L160; |
| } |
| } else { |
| di = d__[i__]; |
| if (di >= bv || di >= dnew) { |
| goto L155; |
| } |
| d__[i__] = dnew; |
| if (dnew >= bv) { |
| /* Delete row I from Q (if necessary); add row I to Q2 */ |
| if (di != -1.) { |
| mc64fd_(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], |
| &c__1); |
| } |
| l[i__] = 0; |
| --low; |
| q[low] = i__; |
| } else { |
| /* Add row I to Q (if necessary); push row I up Q */ |
| if (di == -1.) { |
| ++qlen; |
| l[i__] = qlen; |
| } |
| mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__1); |
| } |
| /* Update tree */ |
| jj = iperm[i__]; |
| pr[jj] = j; |
| } |
| L155: |
| ; |
| } |
| /* L150: */ |
| } |
| /* If CSP = MINONE, no augmenting path is found */ |
| L160: |
| if (csp == -1.) { |
| goto L190; |
| } |
| /* Update bottleneck value */ |
| bv = min(bv,csp); |
| /* Find augmenting path by tracing backward in PR; update IPERM,JPERM */ |
| ++(*num); |
| i__ = isp; |
| j = jsp; |
| i__2 = *num + 1; |
| for (jdum = 1; jdum <= i__2; ++jdum) { |
| i0 = jperm[j]; |
| jperm[j] = i__; |
| iperm[i__] = j; |
| j = pr[j]; |
| if (j == -1) { |
| goto L190; |
| } |
| i__ = i0; |
| /* L170: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| L190: |
| i__2 = *n; |
| for (kk = up; kk <= i__2; ++kk) { |
| i__ = q[kk]; |
| d__[i__] = -1.; |
| l[i__] = 0; |
| /* L191: */ |
| } |
| i__2 = up - 1; |
| for (kk = low; kk <= i__2; ++kk) { |
| i__ = q[kk]; |
| d__[i__] = -1.; |
| /* L192: */ |
| } |
| i__2 = qlen; |
| for (kk = 1; kk <= i__2; ++kk) { |
| i__ = q[kk]; |
| d__[i__] = -1.; |
| l[i__] = 0; |
| /* L193: */ |
| } |
| L100: |
| ; |
| } |
| /* End of main loop */ |
| /* BV is bottleneck value of final matching */ |
| if (*num == *n) { |
| goto L1000; |
| } |
| /* Matrix is structurally singular, complete IPERM. */ |
| /* JPERM, PR are work arrays */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| jperm[j] = 0; |
| /* L300: */ |
| } |
| k = 0; |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| if (iperm[i__] == 0) { |
| ++k; |
| pr[k] = i__; |
| } else { |
| j = iperm[i__]; |
| jperm[j] = i__; |
| } |
| /* L310: */ |
| } |
| k = 0; |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| if (jperm[i__] != 0) { |
| goto L320; |
| } |
| ++k; |
| jdum = pr[k]; |
| iperm[jdum] = i__; |
| L320: |
| ; |
| } |
| L1000: |
| return 0; |
| } /* mc64bd_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64dd_(int_t *i__, int_t *n, int_t *q, double |
| *d__, int_t *l, int_t *iway) |
| { |
| /* System generated locals */ |
| int_t i__1; |
| /* Local variables */ |
| double di; |
| int_t qk, pos, idum, posk; |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* Variables N,Q,D,L are described in MC64B/BD */ |
| /* IF IWAY is equal to 1, then */ |
| /* node I is pushed from its current position upwards */ |
| /* IF IWAY is not equal to 1, then */ |
| /* node I is pushed from its current position downwards */ |
| /* Local variables and parameters */ |
| /* Parameter adjustments */ |
| --l; |
| --d__; |
| --q; |
| /* Function Body */ |
| di = d__[*i__]; |
| pos = l[*i__]; |
| /* POS is index of current position of I in the tree */ |
| if (*iway == 1) { |
| i__1 = *n; |
| for (idum = 1; idum <= i__1; ++idum) { |
| if (pos <= 1) { |
| goto L20; |
| } |
| posk = pos / 2; |
| qk = q[posk]; |
| if (di <= d__[qk]) { |
| goto L20; |
| } |
| q[pos] = qk; |
| l[qk] = pos; |
| pos = posk; |
| /* L10: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| } else { |
| i__1 = *n; |
| for (idum = 1; idum <= i__1; ++idum) { |
| if (pos <= 1) { |
| goto L20; |
| } |
| posk = pos / 2; |
| qk = q[posk]; |
| if (di >= d__[qk]) { |
| goto L20; |
| } |
| q[pos] = qk; |
| l[qk] = pos; |
| pos = posk; |
| /* L15: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| } |
| /* End of dummy if; this point is never reached */ |
| L20: |
| q[pos] = *i__; |
| l[*i__] = pos; |
| return 0; |
| } /* mc64dd_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64ed_(int_t *qlen, int_t *n, int_t *q, |
| double *d__, int_t *l, int_t *iway) |
| { |
| /* System generated locals */ |
| int_t i__1; |
| /* Local variables */ |
| int_t i__; |
| double di, dk, dr; |
| int_t pos, idum, posk; |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */ |
| /* MC64W/WD (IWAY = 2) */ |
| /* The root node is deleted from the binary heap. */ |
| /* Local variables and parameters */ |
| /* Move last element to begin of Q */ |
| /* Parameter adjustments */ |
| --l; |
| --d__; |
| --q; |
| /* Function Body */ |
| i__ = q[*qlen]; |
| di = d__[i__]; |
| --(*qlen); |
| pos = 1; |
| if (*iway == 1) { |
| i__1 = *n; |
| for (idum = 1; idum <= i__1; ++idum) { |
| posk = pos << 1; |
| if (posk > *qlen) { |
| goto L20; |
| } |
| dk = d__[q[posk]]; |
| if (posk < *qlen) { |
| dr = d__[q[posk + 1]]; |
| if (dk < dr) { |
| ++posk; |
| dk = dr; |
| } |
| } |
| if (di >= dk) { |
| goto L20; |
| } |
| /* Exchange old last element with larger priority child */ |
| q[pos] = q[posk]; |
| l[q[pos]] = pos; |
| pos = posk; |
| /* L10: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| } else { |
| i__1 = *n; |
| for (idum = 1; idum <= i__1; ++idum) { |
| posk = pos << 1; |
| if (posk > *qlen) { |
| goto L20; |
| } |
| dk = d__[q[posk]]; |
| if (posk < *qlen) { |
| dr = d__[q[posk + 1]]; |
| if (dk > dr) { |
| ++posk; |
| dk = dr; |
| } |
| } |
| if (di <= dk) { |
| goto L20; |
| } |
| /* Exchange old last element with smaller child */ |
| q[pos] = q[posk]; |
| l[q[pos]] = pos; |
| pos = posk; |
| /* L15: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| } |
| /* End of dummy if; this point is never reached */ |
| L20: |
| q[pos] = i__; |
| l[i__] = pos; |
| return 0; |
| } /* mc64ed_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64fd_(int_t *pos0, int_t *qlen, int_t *n, |
| int_t *q, double *d__, int_t *l, int_t *iway) |
| { |
| /* System generated locals */ |
| int_t i__1; |
| /* Local variables */ |
| int_t i__; |
| double di, dk, dr; |
| int_t qk, pos, idum, posk; |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* Variables QLEN,N,Q,D,L are described in MC64B/BD (IWAY = 1) or */ |
| /* MC64WD (IWAY = 2). */ |
| /* Move last element in the heap */ |
| /* Quick return, if possible */ |
| /* Parameter adjustments */ |
| --l; |
| --d__; |
| --q; |
| /* Function Body */ |
| if (*qlen == *pos0) { |
| --(*qlen); |
| return 0; |
| } |
| /* Move last element from queue Q to position POS0 */ |
| /* POS is current position of node I in the tree */ |
| i__ = q[*qlen]; |
| di = d__[i__]; |
| --(*qlen); |
| pos = *pos0; |
| if (*iway == 1) { |
| i__1 = *n; |
| for (idum = 1; idum <= i__1; ++idum) { |
| if (pos <= 1) { |
| goto L20; |
| } |
| posk = pos / 2; |
| qk = q[posk]; |
| if (di <= d__[qk]) { |
| goto L20; |
| } |
| q[pos] = qk; |
| l[qk] = pos; |
| pos = posk; |
| /* L10: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| L20: |
| q[pos] = i__; |
| l[i__] = pos; |
| i__1 = *n; |
| for (idum = 1; idum <= i__1; ++idum) { |
| posk = pos << 1; |
| if (posk > *qlen) { |
| goto L40; |
| } |
| dk = d__[q[posk]]; |
| if (posk < *qlen) { |
| dr = d__[q[posk + 1]]; |
| if (dk < dr) { |
| ++posk; |
| dk = dr; |
| } |
| } |
| if (di >= dk) { |
| goto L40; |
| } |
| qk = q[posk]; |
| q[pos] = qk; |
| l[qk] = pos; |
| pos = posk; |
| /* L30: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| } else { |
| i__1 = *n; |
| for (idum = 1; idum <= i__1; ++idum) { |
| if (pos <= 1) { |
| goto L34; |
| } |
| posk = pos / 2; |
| qk = q[posk]; |
| if (di >= d__[qk]) { |
| goto L34; |
| } |
| q[pos] = qk; |
| l[qk] = pos; |
| pos = posk; |
| /* L32: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| L34: |
| q[pos] = i__; |
| l[i__] = pos; |
| i__1 = *n; |
| for (idum = 1; idum <= i__1; ++idum) { |
| posk = pos << 1; |
| if (posk > *qlen) { |
| goto L40; |
| } |
| dk = d__[q[posk]]; |
| if (posk < *qlen) { |
| dr = d__[q[posk + 1]]; |
| if (dk > dr) { |
| ++posk; |
| dk = dr; |
| } |
| } |
| if (di <= dk) { |
| goto L40; |
| } |
| qk = q[posk]; |
| q[pos] = qk; |
| l[qk] = pos; |
| pos = posk; |
| /* L36: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| } |
| /* End of dummy if; this point is never reached */ |
| L40: |
| q[pos] = i__; |
| l[i__] = pos; |
| return 0; |
| } /* mc64fd_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64rd_(int_t *n, int_t *ne, int_t *ip, int_t * |
| irn, double *a) |
| { |
| /* System generated locals */ |
| int_t i__1, i__2, i__3; |
| /* Local variables */ |
| int_t j, k, r__, s; |
| double ha; |
| int_t hi, td, mid, len, ipj; |
| double key; |
| int_t last, todo[50], first; |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* This subroutine sorts the entries in each column of the */ |
| /* sparse matrix (defined by N,NE,IP,IRN,A) by decreasing */ |
| /* numerical value. */ |
| /* Local constants */ |
| /* Local variables */ |
| /* Local arrays */ |
| /* Parameter adjustments */ |
| --ip; |
| --a; |
| --irn; |
| /* Function Body */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| len = ip[j + 1] - ip[j]; |
| if (len <= 1) { |
| goto L100; |
| } |
| ipj = ip[j]; |
| /* Sort array roughly with partial quicksort */ |
| if (len < 15) { |
| goto L400; |
| } |
| todo[0] = ipj; |
| todo[1] = ipj + len; |
| td = 2; |
| L500: |
| first = todo[td - 2]; |
| last = todo[td - 1]; |
| /* KEY is the smallest of two values present in interval [FIRST,LAST) */ |
| key = a[(first + last) / 2]; |
| i__2 = last - 1; |
| for (k = first; k <= i__2; ++k) { |
| ha = a[k]; |
| if (ha == key) { |
| goto L475; |
| } |
| if (ha > key) { |
| goto L470; |
| } |
| key = ha; |
| goto L470; |
| L475: |
| ; |
| } |
| /* Only one value found in interval, so it is already sorted */ |
| td += -2; |
| goto L425; |
| /* Reorder interval [FIRST,LAST) such that entries before MID are gt KEY */ |
| L470: |
| mid = first; |
| i__2 = last - 1; |
| for (k = first; k <= i__2; ++k) { |
| if (a[k] <= key) { |
| goto L450; |
| } |
| ha = a[mid]; |
| a[mid] = a[k]; |
| a[k] = ha; |
| hi = irn[mid]; |
| irn[mid] = irn[k]; |
| irn[k] = hi; |
| ++mid; |
| L450: |
| ; |
| } |
| /* Both subintervals [FIRST,MID), [MID,LAST) are nonempty */ |
| /* Stack the longest of the two subintervals first */ |
| if (mid - first >= last - mid) { |
| todo[td + 1] = last; |
| todo[td] = mid; |
| todo[td - 1] = mid; |
| /* TODO(TD-1) = FIRST */ |
| } else { |
| todo[td + 1] = mid; |
| todo[td] = first; |
| todo[td - 1] = last; |
| todo[td - 2] = mid; |
| } |
| td += 2; |
| L425: |
| if (td == 0) { |
| goto L400; |
| } |
| /* There is still work to be done */ |
| if (todo[td - 1] - todo[td - 2] >= 15) { |
| goto L500; |
| } |
| /* Next interval is already short enough for straightforward insertion */ |
| td += -2; |
| goto L425; |
| /* Complete sorting with straightforward insertion */ |
| L400: |
| i__2 = ipj + len - 1; |
| for (r__ = ipj + 1; r__ <= i__2; ++r__) { |
| if (a[r__ - 1] < a[r__]) { |
| ha = a[r__]; |
| hi = irn[r__]; |
| a[r__] = a[r__ - 1]; |
| irn[r__] = irn[r__ - 1]; |
| i__3 = ipj + 1; |
| for (s = r__ - 1; s >= i__3; --s) { |
| if (a[s - 1] < ha) { |
| a[s] = a[s - 1]; |
| irn[s] = irn[s - 1]; |
| } else { |
| a[s] = ha; |
| irn[s] = hi; |
| goto L200; |
| } |
| /* L300: */ |
| } |
| a[ipj] = ha; |
| irn[ipj] = hi; |
| } |
| L200: |
| ; |
| } |
| L100: |
| ; |
| } |
| return 0; |
| } /* mc64rd_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64sd_(int_t *n, int_t *ne, int_t *ip, int_t * |
| irn, double *a, int_t *iperm, int_t *numx, int_t *w, |
| int_t *len, int_t *lenl, int_t *lenh, int_t *fc, int_t *iw, |
| int_t *iw4) |
| { |
| /* System generated locals */ |
| int_t i__1, i__2, i__3, i__4; |
| /* Local variables */ |
| int_t i__, j, k, l, ii, mod, cnt, num; |
| double bval, bmin, bmax, rinf; |
| int_t nval, wlen, idum1, idum2, idum3; |
| extern /* Subroutine */ int_t mc64qd_(int_t *, int_t *, int_t *, |
| int_t *, int_t *, double *, int_t *, double *), |
| mc64ud_(int_t *, int_t *, int_t *, int_t *, int_t *, |
| int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, |
| int_t *, int_t *, int_t *, int_t *); |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* N, NE, IP, IRN, are described in MC64A/AD. */ |
| /* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ |
| /* A(K), K=1..NE, must be set to the value of the entry that */ |
| /* corresponds to IRN(k). The entries in each column must be */ |
| /* non-negative and ordered by decreasing value. */ |
| /* IPERM is an INT_T array of length N. On exit, it contains the */ |
| /* bottleneck matching: IPERM(I) - 0 or row I is matched to column */ |
| /* IPERM(I). */ |
| /* NUMX is an INT_T variable. On exit, it contains the cardinality */ |
| /* of the matching stored in IPERM. */ |
| /* IW is an INT_T work array of length 10N. */ |
| /* FC is an int_t array of length N that contains the list of */ |
| /* unmatched columns. */ |
| /* LEN(J), LENL(J), LENH(J) are int_t arrays of length N that point */ |
| /* to entries in matrix column J. */ |
| /* In the matrix defined by the column parts IP(J)+LENL(J) we know */ |
| /* a matching does not exist; in the matrix defined by the column */ |
| /* parts IP(J)+LENH(J) we know one exists. */ |
| /* LEN(J) lies between LENL(J) and LENH(J) and determines the matrix */ |
| /* that is tested for a maximum matching. */ |
| /* W is an int_t array of length N and contains the indices of the */ |
| /* columns for which LENL ne LENH. */ |
| /* WLEN is number of indices stored in array W. */ |
| /* IW is int_t work array of length N. */ |
| /* IW4 is int_t work array of length 4N used by MC64U/UD. */ |
| /* EXTERNAL FD05AD,MC64QD,MC64UD */ |
| /* DOUBLE PRECISION FD05AD */ |
| /* BMIN and BMAX are such that a maximum matching exists for the input */ |
| /* matrix in which all entries smaller than BMIN are dropped. */ |
| /* For BMAX, a maximum matching does not exist. */ |
| /* BVAL is a value between BMIN and BMAX. */ |
| /* CNT is the number of calls made to MC64U/UD so far. */ |
| /* NUM is the cardinality of last matching found. */ |
| /* Set RINF to largest positive real number */ |
| /* XSL RINF = FD05AD(5) */ |
| /* Parameter adjustments */ |
| --iw4; |
| --iw; |
| --fc; |
| --lenh; |
| --lenl; |
| --len; |
| --w; |
| --iperm; |
| --ip; |
| --a; |
| --irn; |
| /* Function Body */ |
| rinf = dlamch_("Overflow"); |
| /* Compute a first maximum matching from scratch on whole matrix. */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| fc[j] = j; |
| iw[j] = 0; |
| len[j] = ip[j + 1] - ip[j]; |
| /* L20: */ |
| } |
| /* The first call to MC64U/UD */ |
| cnt = 1; |
| mod = 1; |
| *numx = 0; |
| mc64ud_(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1], numx, |
| n, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1], &iw4[*n * 3 + 1]); |
| /* IW contains a maximum matching of length NUMX. */ |
| num = *numx; |
| if (num != *n) { |
| /* Matrix is structurally singular */ |
| bmax = rinf; |
| } else { |
| /* Matrix is structurally nonsingular, NUM=NUMX=N; */ |
| /* Set BMAX just above the smallest of all the maximum absolute */ |
| /* values of the columns */ |
| bmax = rinf; |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| bval = 0.f; |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| if (a[k] > bval) { |
| bval = a[k]; |
| } |
| /* L25: */ |
| } |
| if (bval < bmax) { |
| bmax = bval; |
| } |
| /* L30: */ |
| } |
| bmax *= 1.001f; |
| } |
| /* Initialize BVAL,BMIN */ |
| bval = 0.f; |
| bmin = 0.f; |
| /* Initialize LENL,LEN,LENH,W,WLEN according to BMAX. */ |
| /* Set LEN(J), LENH(J) just after last entry in column J. */ |
| /* Set LENL(J) just after last entry in column J with value ge BMAX. */ |
| wlen = 0; |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| l = ip[j + 1] - ip[j]; |
| lenh[j] = l; |
| len[j] = l; |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| if (a[k] < bmax) { |
| goto L46; |
| } |
| /* L45: */ |
| } |
| /* Column J is empty or all entries are ge BMAX */ |
| k = ip[j + 1]; |
| L46: |
| lenl[j] = k - ip[j]; |
| /* Add J to W if LENL(J) ne LENH(J) */ |
| if (lenl[j] == l) { |
| goto L48; |
| } |
| ++wlen; |
| w[wlen] = j; |
| L48: |
| ; |
| } |
| /* Main loop */ |
| i__1 = *ne; |
| for (idum1 = 1; idum1 <= i__1; ++idum1) { |
| if (num == *numx) { |
| /* We have a maximum matching in IW; store IW in IPERM */ |
| i__2 = *n; |
| for (i__ = 1; i__ <= i__2; ++i__) { |
| iperm[i__] = iw[i__]; |
| /* L50: */ |
| } |
| /* Keep going round this loop until matching IW is no longer maximum. */ |
| i__2 = *ne; |
| for (idum2 = 1; idum2 <= i__2; ++idum2) { |
| bmin = bval; |
| if (bmax == bmin) { |
| goto L99; |
| } |
| /* Find splitting value BVAL */ |
| mc64qd_(&ip[1], &lenl[1], &len[1], &w[1], &wlen, &a[1], &nval, |
| &bval); |
| if (nval <= 1) { |
| goto L99; |
| } |
| /* Set LEN such that all matrix entries with value lt BVAL are */ |
| /* discarded. Store old LEN in LENH. Do this for all columns W(K). */ |
| /* Each step, either K is incremented or WLEN is decremented. */ |
| k = 1; |
| i__3 = *n; |
| for (idum3 = 1; idum3 <= i__3; ++idum3) { |
| if (k > wlen) { |
| goto L71; |
| } |
| j = w[k]; |
| i__4 = ip[j] + lenl[j]; |
| for (ii = ip[j] + len[j] - 1; ii >= i__4; --ii) { |
| if (a[ii] >= bval) { |
| goto L60; |
| } |
| i__ = irn[ii]; |
| if (iw[i__] != j) { |
| goto L55; |
| } |
| /* Remove entry from matching */ |
| iw[i__] = 0; |
| --num; |
| fc[*n - num] = j; |
| L55: |
| ; |
| } |
| L60: |
| lenh[j] = len[j]; |
| /* IP(J)+LEN(J)-1 is last entry in column ge BVAL */ |
| len[j] = ii - ip[j] + 1; |
| /* If LENH(J) = LENL(J), remove J from W */ |
| if (lenl[j] == lenh[j]) { |
| w[k] = w[wlen]; |
| --wlen; |
| } else { |
| ++k; |
| } |
| /* L70: */ |
| } |
| L71: |
| if (num < *numx) { |
| goto L81; |
| } |
| /* L80: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| /* Set mode for next call to MC64U/UD */ |
| L81: |
| mod = 1; |
| } else { |
| /* We do not have a maximum matching in IW. */ |
| bmax = bval; |
| /* BMIN is the bottleneck value of a maximum matching; */ |
| /* for BMAX the matching is not maximum, so BMAX>BMIN */ |
| /* IF (BMAX .EQ. BMIN) GO TO 99 */ |
| /* Find splitting value BVAL */ |
| mc64qd_(&ip[1], &len[1], &lenh[1], &w[1], &wlen, &a[1], &nval, & |
| bval); |
| if (nval == 0 || bval == bmin) { |
| goto L99; |
| } |
| /* Set LEN such that all matrix entries with value ge BVAL are */ |
| /* inside matrix. Store old LEN in LENL. Do this for all columns W(K). */ |
| /* Each step, either K is incremented or WLEN is decremented. */ |
| k = 1; |
| i__2 = *n; |
| for (idum3 = 1; idum3 <= i__2; ++idum3) { |
| if (k > wlen) { |
| goto L88; |
| } |
| j = w[k]; |
| i__3 = ip[j] + lenh[j] - 1; |
| for (ii = ip[j] + len[j]; ii <= i__3; ++ii) { |
| if (a[ii] < bval) { |
| goto L86; |
| } |
| /* L85: */ |
| } |
| L86: |
| lenl[j] = len[j]; |
| len[j] = ii - ip[j]; |
| if (lenl[j] == lenh[j]) { |
| w[k] = w[wlen]; |
| --wlen; |
| } else { |
| ++k; |
| } |
| /* L87: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| /* Set mode for next call to MC64U/UD */ |
| L88: |
| mod = 0; |
| } |
| ++cnt; |
| mc64ud_(&cnt, &mod, n, &irn[1], ne, &ip[1], &len[1], &fc[1], &iw[1], & |
| num, numx, &iw4[1], &iw4[*n + 1], &iw4[(*n << 1) + 1], &iw4[* |
| n * 3 + 1]); |
| /* IW contains maximum matching of length NUM */ |
| /* L90: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| /* BMIN is bottleneck value of final matching */ |
| L99: |
| if (*numx == *n) { |
| goto L1000; |
| } |
| /* The matrix is structurally singular, complete IPERM */ |
| /* W, IW are work arrays */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| w[j] = 0; |
| /* L300: */ |
| } |
| k = 0; |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| if (iperm[i__] == 0) { |
| ++k; |
| iw[k] = i__; |
| } else { |
| j = iperm[i__]; |
| w[j] = i__; |
| } |
| /* L310: */ |
| } |
| k = 0; |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| if (w[j] != 0) { |
| goto L320; |
| } |
| ++k; |
| idum1 = iw[k]; |
| iperm[idum1] = j; |
| L320: |
| ; |
| } |
| L1000: |
| return 0; |
| } /* mc64sd_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64qd_(int_t *ip, int_t *lenl, int_t *lenh, |
| int_t *w, int_t *wlen, double *a, int_t *nval, double * |
| val) |
| { |
| /* System generated locals */ |
| int_t i__1, i__2, i__3; |
| /* Local variables */ |
| int_t j, k, s; |
| double ha; |
| int_t ii, pos; |
| double split[10]; |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* This routine searches for at most XX different numerical values */ |
| /* in the columns W(1:WLEN). XX>=2. */ |
| /* Each column J is scanned between IP(J)+LENL(J) and IP(J)+LENH(J)-1 */ |
| /* until XX values are found or all columns have been considered. */ |
| /* On output, NVAL is the number of different values that is found */ |
| /* and SPLIT(1:NVAL) contains the values in decreasing order. */ |
| /* If NVAL > 0, the routine returns VAL = SPLIT((NVAL+1)/2). */ |
| /* Scan columns in W(1:WLEN). For each encountered value, if value not */ |
| /* already present in SPLIT(1:NVAL), insert value such that SPLIT */ |
| /* remains sorted by decreasing value. */ |
| /* The sorting is done by straightforward insertion; therefore the use */ |
| /* of this routine should be avoided for large XX (XX < 20). */ |
| /* Parameter adjustments */ |
| --a; |
| --w; |
| --lenh; |
| --lenl; |
| --ip; |
| /* Function Body */ |
| *nval = 0; |
| i__1 = *wlen; |
| for (k = 1; k <= i__1; ++k) { |
| j = w[k]; |
| i__2 = ip[j] + lenh[j] - 1; |
| for (ii = ip[j] + lenl[j]; ii <= i__2; ++ii) { |
| ha = a[ii]; |
| if (*nval == 0) { |
| split[0] = ha; |
| *nval = 1; |
| } else { |
| /* Check presence of HA in SPLIT */ |
| for (s = *nval; s >= 1; --s) { |
| if (split[s - 1] == ha) { |
| goto L15; |
| } |
| if (split[s - 1] > ha) { |
| pos = s + 1; |
| goto L21; |
| } |
| /* L20: */ |
| } |
| pos = 1; |
| /* The insertion */ |
| L21: |
| i__3 = pos; |
| for (s = *nval; s >= i__3; --s) { |
| split[s] = split[s - 1]; |
| /* L22: */ |
| } |
| split[pos - 1] = ha; |
| ++(*nval); |
| } |
| /* Exit loop if XX values are found */ |
| if (*nval == 10) { |
| goto L11; |
| } |
| L15: |
| ; |
| } |
| /* L10: */ |
| } |
| /* Determine VAL */ |
| L11: |
| if (*nval > 0) { |
| *val = split[(*nval + 1) / 2 - 1]; |
| } |
| return 0; |
| } /* mc64qd_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64ud_(int_t *id, int_t *mod, int_t *n, int_t * |
| irn, int_t *lirn, int_t *ip, int_t *lenc, int_t *fc, int_t * |
| iperm, int_t *num, int_t *numx, int_t *pr, int_t *arp, |
| int_t *cv, int_t *out) |
| { |
| /* System generated locals */ |
| int_t i__1, i__2, i__3, i__4; |
| /* Local variables */ |
| int_t i__, j, k, j1, ii, kk, id0, id1, in1, in2, nfc, num0, num1, num2, |
| jord, last; |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* PR(J) is the previous column to J in the depth first search. */ |
| /* Array PR is used as workspace in the sorting algorithm. */ |
| /* Elements (I,IPERM(I)) I=1,..,N are entries at the end of the */ |
| /* algorithm unless N assignments have not been made in which case */ |
| /* N-NUM pairs (I,IPERM(I)) will not be entries in the matrix. */ |
| /* CV(I) is the most recent loop number (ID+JORD) at which row I */ |
| /* was visited. */ |
| /* ARP(J) is the number of entries in column J which have been scanned */ |
| /* when looking for a cheap assignment. */ |
| /* OUT(J) is one less than the number of entries in column J which have */ |
| /* not been scanned during one pass through the main loop. */ |
| /* NUMX is maximum possible size of matching. */ |
| /* Parameter adjustments */ |
| --out; |
| --cv; |
| --arp; |
| --pr; |
| --iperm; |
| --fc; |
| --lenc; |
| --ip; |
| --irn; |
| /* Function Body */ |
| if (*id == 1) { |
| /* The first call to MC64U/UD. */ |
| /* Initialize CV and ARP; parameters MOD, NUMX are not accessed */ |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| cv[i__] = 0; |
| arp[i__] = 0; |
| /* L5: */ |
| } |
| num1 = *n; |
| num2 = *n; |
| } else { |
| /* Not the first call to MC64U/UD. */ |
| /* Re-initialize ARP if entries were deleted since last call to MC64U/UD */ |
| if (*mod == 1) { |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| arp[i__] = 0; |
| /* L8: */ |
| } |
| } |
| num1 = *numx; |
| num2 = *n - *numx; |
| } |
| num0 = *num; |
| /* NUM0 is size of input matching */ |
| /* NUM1 is maximum possible size of matching */ |
| /* NUM2 is maximum allowed number of unassigned rows/columns */ |
| /* NUM is size of current matching */ |
| /* Quick return if possible */ |
| /* IF (NUM.EQ.N) GO TO 199 */ |
| /* NFC is number of rows/columns that could not be assigned */ |
| nfc = 0; |
| /* Integers ID0+1 to ID0+N are unique numbers for call ID to MC64U/UD, */ |
| /* so 1st call uses 1..N, 2nd call uses N+1..2N, etc */ |
| id0 = (*id - 1) * *n; |
| /* Main loop. Each pass round this loop either results in a new */ |
| /* assignment or gives a column with no assignment */ |
| i__1 = *n; |
| for (jord = num0 + 1; jord <= i__1; ++jord) { |
| /* Each pass uses unique number ID1 */ |
| id1 = id0 + jord; |
| /* J is unmatched column */ |
| j = fc[jord - num0]; |
| pr[j] = -1; |
| i__2 = jord; |
| for (k = 1; k <= i__2; ++k) { |
| /* Look for a cheap assignment */ |
| if (arp[j] >= lenc[j]) { |
| goto L30; |
| } |
| in1 = ip[j] + arp[j]; |
| in2 = ip[j] + lenc[j] - 1; |
| i__3 = in2; |
| for (ii = in1; ii <= i__3; ++ii) { |
| i__ = irn[ii]; |
| if (iperm[i__] == 0) { |
| goto L80; |
| } |
| /* L20: */ |
| } |
| /* No cheap assignment in row */ |
| arp[j] = lenc[j]; |
| /* Begin looking for assignment chain starting with row J */ |
| L30: |
| out[j] = lenc[j] - 1; |
| /* Inner loop. Extends chain by one or backtracks */ |
| i__3 = jord; |
| for (kk = 1; kk <= i__3; ++kk) { |
| in1 = out[j]; |
| if (in1 < 0) { |
| goto L50; |
| } |
| in2 = ip[j] + lenc[j] - 1; |
| in1 = in2 - in1; |
| /* Forward scan */ |
| i__4 = in2; |
| for (ii = in1; ii <= i__4; ++ii) { |
| i__ = irn[ii]; |
| if (cv[i__] == id1) { |
| goto L40; |
| } |
| /* Column J has not yet been accessed during this pass */ |
| j1 = j; |
| j = iperm[i__]; |
| cv[i__] = id1; |
| pr[j] = j1; |
| out[j1] = in2 - ii - 1; |
| goto L70; |
| L40: |
| ; |
| } |
| /* Backtracking step. */ |
| L50: |
| j1 = pr[j]; |
| if (j1 == -1) { |
| /* No augmenting path exists for column J. */ |
| ++nfc; |
| fc[nfc] = j; |
| if (nfc > num2) { |
| /* A matching of maximum size NUM1 is not possible */ |
| last = jord; |
| goto L101; |
| } |
| goto L100; |
| } |
| j = j1; |
| /* L60: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| L70: |
| ; |
| } |
| /* End of dummy loop; this point is never reached */ |
| /* New assignment is made. */ |
| L80: |
| iperm[i__] = j; |
| arp[j] = ii - ip[j] + 1; |
| ++(*num); |
| i__2 = jord; |
| for (k = 1; k <= i__2; ++k) { |
| j = pr[j]; |
| if (j == -1) { |
| goto L95; |
| } |
| ii = ip[j] + lenc[j] - out[j] - 2; |
| i__ = irn[ii]; |
| iperm[i__] = j; |
| /* L90: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| L95: |
| if (*num == num1) { |
| /* A matching of maximum size NUM1 is found */ |
| last = jord; |
| goto L101; |
| } |
| L100: |
| ; |
| } |
| /* All unassigned columns have been considered */ |
| last = *n; |
| /* Now, a transversal is computed or is not possible. */ |
| /* Complete FC before returning. */ |
| L101: |
| i__1 = *n; |
| for (jord = last + 1; jord <= i__1; ++jord) { |
| ++nfc; |
| fc[nfc] = fc[jord - num0]; |
| /* L110: */ |
| } |
| /* 199 RETURN */ |
| return 0; |
| } /* mc64ud_ */ |
| /* ********************************************************************** */ |
| /* Subroutine */ int_t mc64wd_(int_t *n, int_t *ne, int_t *ip, int_t * |
| irn, double *a, int_t *iperm, int_t *num, int_t *jperm, |
| int_t *out, int_t *pr, int_t *q, int_t *l, double *u, |
| double *d__) |
| { |
| /* System generated locals */ |
| int_t i__1, i__2, i__3; |
| /* Local variables */ |
| int_t i__, j, k, i0, k0, k1, k2, q0; |
| double di; |
| int_t ii, jj, kk; |
| double vj; |
| int_t up; |
| double dq0; |
| int_t kk1, kk2; |
| double csp; |
| int_t isp, jsp, low; |
| double dmin__, dnew; |
| int_t jord, qlen, jdum; |
| double rinf; |
| extern /* Subroutine */ int_t mc64dd_(int_t *, int_t *, int_t *, |
| double *, int_t *, int_t *), mc64ed_(int_t *, int_t *, |
| int_t *, double *, int_t *, int_t *), mc64fd_(int_t * |
| , int_t *, int_t *, int_t *, double *, int_t *, |
| int_t *); |
| /* *** Copyright (c) 1999 Council for the Central Laboratory of the */ |
| /* Research Councils *** */ |
| /* *** Although every effort has been made to ensure robustness and *** */ |
| /* *** reliability of the subroutines in this MC64 suite, we *** */ |
| /* *** disclaim any liability arising through the use or misuse of *** */ |
| /* *** any of the subroutines. *** */ |
| /* *** Any problems? Contact ... */ |
| /* Iain Duff (I.Duff@rl.ac.uk) or Jacko Koster (jak@ii.uib.no) *** */ |
| /* N, NE, IP, IRN are described in MC64A/AD. */ |
| /* A is a REAL (DOUBLE PRECISION in the D-version) array of length NE. */ |
| /* A(K), K=1..NE, must be set to the value of the entry that */ |
| /* corresponds to IRN(K). It is not altered. */ |
| /* All values A(K) must be non-negative. */ |
| /* IPERM is an INT_T array of length N. On exit, it contains the */ |
| /* weighted matching: IPERM(I) = 0 or row I is matched to column */ |
| /* IPERM(I). */ |
| /* NUM is an INT_T variable. On exit, it contains the cardinality of */ |
| /* the matching stored in IPERM. */ |
| /* IW is an INT_T work array of length 5N. */ |
| /* DW is a REAL (DOUBLE PRECISION in the D-version) array of length 2N. */ |
| /* On exit, U = D(1:N) contains the dual row variable and */ |
| /* V = D(N+1:2N) contains the dual column variable. If the matrix */ |
| /* is structurally nonsingular (NUM = N), the following holds: */ |
| /* U(I)+V(J) <= A(I,J) if IPERM(I) |= J */ |
| /* U(I)+V(J) = A(I,J) if IPERM(I) = J */ |
| /* U(I) = 0 if IPERM(I) = 0 */ |
| /* V(J) = 0 if there is no I for which IPERM(I) = J */ |
| /* Local variables */ |
| /* Local parameters */ |
| /* External subroutines and/or functions */ |
| /* EXTERNAL FD05AD,MC64DD,MC64ED,MC64FD */ |
| /* DOUBLE PRECISION FD05AD */ |
| /* Set RINF to largest positive real number */ |
| /* XSL RINF = FD05AD(5) */ |
| /* Parameter adjustments */ |
| --d__; |
| --u; |
| --l; |
| --q; |
| --pr; |
| --out; |
| --jperm; |
| --iperm; |
| --ip; |
| --a; |
| --irn; |
| /* Function Body */ |
| rinf = dlamch_("Overflow"); |
| /* Initialization */ |
| *num = 0; |
| i__1 = *n; |
| for (k = 1; k <= i__1; ++k) { |
| u[k] = rinf; |
| d__[k] = 0.; |
| iperm[k] = 0; |
| jperm[k] = 0; |
| pr[k] = ip[k]; |
| l[k] = 0; |
| /* L10: */ |
| } |
| /* Initialize U(I) */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| i__ = irn[k]; |
| if (a[k] > u[i__]) { |
| goto L20; |
| } |
| u[i__] = a[k]; |
| iperm[i__] = j; |
| l[i__] = k; |
| L20: |
| ; |
| } |
| /* L30: */ |
| } |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| j = iperm[i__]; |
| if (j == 0) { |
| goto L40; |
| } |
| /* Row I is not empty */ |
| iperm[i__] = 0; |
| if (jperm[j] != 0) { |
| goto L40; |
| } |
| /* Assignment of column J to row I */ |
| ++(*num); |
| iperm[i__] = j; |
| jperm[j] = l[i__]; |
| L40: |
| ; |
| } |
| if (*num == *n) { |
| goto L1000; |
| } |
| /* Scan unassigned columns; improve assignment */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| /* JPERM(J) ne 0 iff column J is already assigned */ |
| if (jperm[j] != 0) { |
| goto L95; |
| } |
| k1 = ip[j]; |
| k2 = ip[j + 1] - 1; |
| /* Continue only if column J is not empty */ |
| if (k1 > k2) { |
| goto L95; |
| } |
| vj = rinf; |
| i__2 = k2; |
| for (k = k1; k <= i__2; ++k) { |
| i__ = irn[k]; |
| di = a[k] - u[i__]; |
| if (di > vj) { |
| goto L50; |
| } |
| if (di < vj || di == rinf) { |
| goto L55; |
| } |
| if (iperm[i__] != 0 || iperm[i0] == 0) { |
| goto L50; |
| } |
| L55: |
| vj = di; |
| i0 = i__; |
| k0 = k; |
| L50: |
| ; |
| } |
| d__[j] = vj; |
| k = k0; |
| i__ = i0; |
| if (iperm[i__] == 0) { |
| goto L90; |
| } |
| i__2 = k2; |
| for (k = k0; k <= i__2; ++k) { |
| i__ = irn[k]; |
| if (a[k] - u[i__] > vj) { |
| goto L60; |
| } |
| jj = iperm[i__]; |
| /* Scan remaining part of assigned column JJ */ |
| kk1 = pr[jj]; |
| kk2 = ip[jj + 1] - 1; |
| if (kk1 > kk2) { |
| goto L60; |
| } |
| i__3 = kk2; |
| for (kk = kk1; kk <= i__3; ++kk) { |
| ii = irn[kk]; |
| if (iperm[ii] > 0) { |
| goto L70; |
| } |
| if (a[kk] - u[ii] <= d__[jj]) { |
| goto L80; |
| } |
| L70: |
| ; |
| } |
| pr[jj] = kk2 + 1; |
| L60: |
| ; |
| } |
| goto L95; |
| L80: |
| jperm[jj] = kk; |
| iperm[ii] = jj; |
| pr[jj] = kk + 1; |
| L90: |
| ++(*num); |
| jperm[j] = k; |
| iperm[i__] = j; |
| pr[j] = k + 1; |
| L95: |
| ; |
| } |
| if (*num == *n) { |
| goto L1000; |
| } |
| /* Prepare for main loop */ |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| d__[i__] = rinf; |
| l[i__] = 0; |
| /* L99: */ |
| } |
| /* Main loop ... each pass round this loop is similar to Dijkstra's */ |
| /* algorithm for solving the single source shortest path problem */ |
| i__1 = *n; |
| for (jord = 1; jord <= i__1; ++jord) { |
| if (jperm[jord] != 0) { |
| goto L100; |
| } |
| /* JORD is next unmatched column */ |
| /* DMIN is the length of shortest path in the tree */ |
| dmin__ = rinf; |
| qlen = 0; |
| low = *n + 1; |
| up = *n + 1; |
| /* CSP is the cost of the shortest augmenting path to unassigned row */ |
| /* IRN(ISP). The corresponding column index is JSP. */ |
| csp = rinf; |
| /* Build shortest path tree starting from unassigned column (root) JORD */ |
| j = jord; |
| pr[j] = -1; |
| /* Scan column J */ |
| i__2 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__2; ++k) { |
| i__ = irn[k]; |
| dnew = a[k] - u[i__]; |
| if (dnew >= csp) { |
| goto L115; |
| } |
| if (iperm[i__] == 0) { |
| csp = dnew; |
| isp = k; |
| jsp = j; |
| } else { |
| if (dnew < dmin__) { |
| dmin__ = dnew; |
| } |
| d__[i__] = dnew; |
| ++qlen; |
| q[qlen] = k; |
| } |
| L115: |
| ; |
| } |
| /* Initialize heap Q and Q2 with rows held in Q(1:QLEN) */ |
| q0 = qlen; |
| qlen = 0; |
| i__2 = q0; |
| for (kk = 1; kk <= i__2; ++kk) { |
| k = q[kk]; |
| i__ = irn[k]; |
| if (csp <= d__[i__]) { |
| d__[i__] = rinf; |
| goto L120; |
| } |
| if (d__[i__] <= dmin__) { |
| --low; |
| q[low] = i__; |
| l[i__] = low; |
| } else { |
| ++qlen; |
| l[i__] = qlen; |
| mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__2); |
| } |
| /* Update tree */ |
| jj = iperm[i__]; |
| out[jj] = k; |
| pr[jj] = j; |
| L120: |
| ; |
| } |
| i__2 = *num; |
| for (jdum = 1; jdum <= i__2; ++jdum) { |
| /* If Q2 is empty, extract rows from Q */ |
| if (low == up) { |
| if (qlen == 0) { |
| goto L160; |
| } |
| i__ = q[1]; |
| if (d__[i__] >= csp) { |
| goto L160; |
| } |
| dmin__ = d__[i__]; |
| L152: |
| mc64ed_(&qlen, n, &q[1], &d__[1], &l[1], &c__2); |
| --low; |
| q[low] = i__; |
| l[i__] = low; |
| if (qlen == 0) { |
| goto L153; |
| } |
| i__ = q[1]; |
| if (d__[i__] > dmin__) { |
| goto L153; |
| } |
| goto L152; |
| } |
| /* Q0 is row whose distance D(Q0) to the root is smallest */ |
| L153: |
| q0 = q[up - 1]; |
| dq0 = d__[q0]; |
| /* Exit loop if path to Q0 is longer than the shortest augmenting path */ |
| if (dq0 >= csp) { |
| goto L160; |
| } |
| --up; |
| /* Scan column that matches with row Q0 */ |
| j = iperm[q0]; |
| vj = dq0 - a[jperm[j]] + u[q0]; |
| i__3 = ip[j + 1] - 1; |
| for (k = ip[j]; k <= i__3; ++k) { |
| i__ = irn[k]; |
| if (l[i__] >= up) { |
| goto L155; |
| } |
| /* DNEW is new cost */ |
| dnew = vj + a[k] - u[i__]; |
| /* Do not update D(I) if DNEW ge cost of shortest path */ |
| if (dnew >= csp) { |
| goto L155; |
| } |
| if (iperm[i__] == 0) { |
| /* Row I is unmatched; update shortest path info */ |
| csp = dnew; |
| isp = k; |
| jsp = j; |
| } else { |
| /* Row I is matched; do not update D(I) if DNEW is larger */ |
| di = d__[i__]; |
| if (di <= dnew) { |
| goto L155; |
| } |
| if (l[i__] >= low) { |
| goto L155; |
| } |
| d__[i__] = dnew; |
| if (dnew <= dmin__) { |
| if (l[i__] != 0) { |
| mc64fd_(&l[i__], &qlen, n, &q[1], &d__[1], &l[1], |
| &c__2); |
| } |
| --low; |
| q[low] = i__; |
| l[i__] = low; |
| } else { |
| if (l[i__] == 0) { |
| ++qlen; |
| l[i__] = qlen; |
| } |
| mc64dd_(&i__, n, &q[1], &d__[1], &l[1], &c__2); |
| } |
| /* Update tree */ |
| jj = iperm[i__]; |
| out[jj] = k; |
| pr[jj] = j; |
| } |
| L155: |
| ; |
| } |
| /* L150: */ |
| } |
| /* If CSP = RINF, no augmenting path is found */ |
| L160: |
| if (csp == rinf) { |
| goto L190; |
| } |
| /* Find augmenting path by tracing backward in PR; update IPERM,JPERM */ |
| ++(*num); |
| i__ = irn[isp]; |
| iperm[i__] = jsp; |
| jperm[jsp] = isp; |
| j = jsp; |
| i__2 = *num; |
| for (jdum = 1; jdum <= i__2; ++jdum) { |
| jj = pr[j]; |
| if (jj == -1) { |
| goto L180; |
| } |
| k = out[j]; |
| i__ = irn[k]; |
| iperm[i__] = jj; |
| jperm[jj] = k; |
| j = jj; |
| /* L170: */ |
| } |
| /* End of dummy loop; this point is never reached */ |
| /* Update U for rows in Q(UP:N) */ |
| L180: |
| i__2 = *n; |
| for (kk = up; kk <= i__2; ++kk) { |
| i__ = q[kk]; |
| u[i__] = u[i__] + d__[i__] - csp; |
| /* L185: */ |
| } |
| L190: |
| i__2 = *n; |
| for (kk = low; kk <= i__2; ++kk) { |
| i__ = q[kk]; |
| d__[i__] = rinf; |
| l[i__] = 0; |
| /* L191: */ |
| } |
| i__2 = qlen; |
| for (kk = 1; kk <= i__2; ++kk) { |
| i__ = q[kk]; |
| d__[i__] = rinf; |
| l[i__] = 0; |
| /* L193: */ |
| } |
| L100: |
| ; |
| } |
| /* End of main loop */ |
| /* Set dual column variable in D(1:N) */ |
| L1000: |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| k = jperm[j]; |
| if (k != 0) { |
| d__[j] = a[k] - u[irn[k]]; |
| } else { |
| d__[j] = 0.; |
| } |
| if (iperm[j] == 0) { |
| u[j] = 0.; |
| } |
| /* L200: */ |
| } |
| if (*num == *n) { |
| goto L1100; |
| } |
| /* The matrix is structurally singular, complete IPERM. */ |
| /* JPERM, OUT are work arrays */ |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| jperm[j] = 0; |
| /* L300: */ |
| } |
| k = 0; |
| i__1 = *n; |
| for (i__ = 1; i__ <= i__1; ++i__) { |
| if (iperm[i__] == 0) { |
| ++k; |
| out[k] = i__; |
| } else { |
| j = iperm[i__]; |
| jperm[j] = i__; |
| } |
| /* L310: */ |
| } |
| k = 0; |
| i__1 = *n; |
| for (j = 1; j <= i__1; ++j) { |
| if (jperm[j] != 0) { |
| goto L320; |
| } |
| ++k; |
| jdum = out[k]; |
| iperm[jdum] = j; |
| L320: |
| ; |
| } |
| L1100: |
| return 0; |
| } /* mc64wd_ */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/superlu_enum_consts.h |
|---|
| New file |
| 0,0 → 1,72 |
| /** @file superlu_enum_consts.h |
| * \brief enum constants header file |
| * |
| * -- SuperLU routine (version 4.1) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley, |
| * October 1, 2010 |
| * |
| */ |
| #ifndef __SUPERLU_ENUM_CONSTS /* allow multiple inclusions */ |
| #define __SUPERLU_ENUM_CONSTS |
| /*********************************************************************** |
| * Enumerate types |
| ***********************************************************************/ |
| typedef enum {NO, YES} yes_no_t; |
| typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t; |
| typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t; |
| typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD, |
| METIS_AT_PLUS_A, PARMETIS, ZOLTAN, MY_PERMC} colperm_t; |
| typedef enum {NOTRANS, TRANS, CONJ} trans_t; |
| typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t; |
| typedef enum {NOREFINE, SINGLE=1, DOUBLE, EXTRA} IterRefine_t; |
| typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; |
| typedef enum {HEAD, TAIL} stack_end_t; |
| typedef enum {SYSTEM, USER} LU_space_t; |
| typedef enum {ONE_NORM, TWO_NORM, INF_NORM} norm_t; |
| typedef enum {SILU, SMILU_1, SMILU_2, SMILU_3} milu_t; |
| #if 0 |
| typedef enum {NODROP = 0x0000, |
| DROP_BASIC = 0x0001, /* ILU(tau) */ |
| DROP_PROWS = 0x0002, /* ILUTP: keep p maximum rows */ |
| DROP_COLUMN = 0x0004, /* ILUTP: for j-th column, |
| p = gamma * nnz(A(:,j)) */ |
| DROP_AREA = 0x0008, /* ILUTP: for j-th column, use |
| nnz(F(:,1:j)) / nnz(A(:,1:j)) |
| to limit memory growth */ |
| DROP_SECONDARY = 0x000E, /* PROWS | COLUMN | AREA */ |
| DROP_DYNAMIC = 0x0010, |
| DROP_INTERP = 0x0100} rule_t; |
| #endif |
| /* |
| * The following enumerate type is used by the statistics variable |
| * to keep track of flop count and time spent at various stages. |
| * |
| * Note that not all of the fields are disjoint. |
| */ |
| typedef enum { |
| COLPERM, /* find a column ordering that minimizes fills */ |
| ROWPERM, /* find a row ordering maximizes diagonal. */ |
| RELAX, /* find artificial supernodes */ |
| ETREE, /* compute column etree */ |
| EQUIL, /* equilibrate the original matrix */ |
| SYMBFAC, /* symbolic factorization. */ |
| DIST, /* distribute matrix. */ |
| FACT, /* perform LU factorization */ |
| COMM, /* communication for factorization */ |
| SOL_COMM,/* communication for solve */ |
| RCOND, /* estimate reciprocal condition number */ |
| SOLVE, /* forward and back solves */ |
| REFINE, /* perform iterative refinement */ |
| FLOAT, /* time spent in floating-point operations */ |
| TRSV, /* fraction of FACT spent in xTRSV */ |
| GEMV, /* fraction of FACT spent in xGEMV */ |
| FERR, /* estimate error bounds after iterative refinement */ |
| NPHASES /* total number of phases */ |
| } PhaseType; |
| #endif /* __SUPERLU_ENUM_CONSTS */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/pdgstrf_X1.c |
|---|
| New file |
| 0,0 → 1,1337 |
| /*! @file |
| * \brief Performs the LU factorization in parallel |
| * |
| * <pre> |
| * -- Distributed SuperLU routine (version 1.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * September 1, 1999 |
| * |
| * Modified: |
| * Feburary 7, 2001 use MPI_Isend/MPI_Irecv |
| * |
| * Sketch of the algorithm |
| * ======================= |
| * |
| * The following relations hold: |
| * * A_kk = L_kk * U_kk |
| * * L_ik = Aik * U_kk^(-1) |
| * * U_kj = L_kk^(-1) * A_kj |
| * |
| * ---------------------------------- |
| * | | | |
| * ----|----------------------------- |
| * | | \ U_kk| | |
| * | | \ | U_kj | |
| * | |L_kk \ | || | |
| * ----|-------|---------||---------- |
| * | | | \/ | |
| * | | | | |
| * | | | | |
| * | | | | |
| * | | L_ik ==> A_ij | |
| * | | | | |
| * | | | | |
| * | | | | |
| * ---------------------------------- |
| * |
| * Handle the first block of columns separately. |
| * * Factor diagonal and subdiagonal blocks and test for exact |
| * singularity. ( pdgstrf2(0), one column at a time ) |
| * * Compute block row of U |
| * * Update trailing matrix |
| * |
| * Loop over the remaining blocks of columns. |
| * mycol = MYCOL( iam, grid ); |
| * myrow = MYROW( iam, grid ); |
| * N = nsupers; |
| * For (k = 1; k < N; ++k) { |
| * krow = PROW( k, grid ); |
| * kcol = PCOL( k, grid ); |
| * Pkk = PNUM( krow, kcol, grid ); |
| * |
| * * Factor diagonal and subdiagonal blocks and test for exact |
| * singularity. |
| * if ( mycol == kcol ) { |
| * pdgstrf2(k), one column at a time |
| * } |
| * |
| * * Parallel triangular solve |
| * if ( iam == Pkk ) multicast L_k,k to this process row; |
| * if ( myrow == krow && mycol != kcol ) { |
| * Recv L_k,k from process Pkk; |
| * for (j = k+1; j < N; ++j) |
| * if ( PCOL( j, grid ) == mycol && A_k,j != 0 ) |
| * U_k,j = L_k,k \ A_k,j; |
| * } |
| * |
| * * Parallel rank-k update |
| * if ( myrow == krow ) multicast U_k,k+1:N to this process column; |
| * if ( mycol == kcol ) multicast L_k+1:N,k to this process row; |
| * if ( myrow != krow ) { |
| * Pkj = PNUM( krow, mycol, grid ); |
| * Recv U_k,k+1:N from process Pkj; |
| * } |
| * if ( mycol != kcol ) { |
| * Pik = PNUM( myrow, kcol, grid ); |
| * Recv L_k+1:N,k from process Pik; |
| * } |
| * for (j = k+1; k < N; ++k) { |
| * for (i = k+1; i < N; ++i) |
| * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) |
| * && L_i,k != 0 && U_k,j != 0 ) |
| * A_i,j = A_i,j - L_i,k * U_k,j; |
| * } |
| * } |
| * |
| * |
| * Remaining issues |
| * (1) Use local indices for L subscripts and SPA. [DONE] |
| * </pre> |
| */ |
| #include <math.h> |
| #include "superlu_ddefs.h" |
| #define CRAY_X1 |
| #if ( VAMPIR>=1 ) |
| #include <VT.h> |
| #endif |
| /* |
| * Internal prototypes |
| */ |
| static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *, |
| gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *); |
| #ifdef _CRAY |
| static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, |
| LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); |
| #else |
| static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, |
| LocalLU_t *, SuperLUStat_t *); |
| #endif |
| /* |
| * |
| */ |
| /************************************************************************/ |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * |
| * PDGSTRF performs the LU factorization in parallel. |
| * |
| * Arguments |
| * ========= |
| * |
| * options (input) superlu_options_t* |
| * The structure defines the input parameters to control |
| * how the LU decomposition will be performed. |
| * The following field should be defined: |
| * o ReplaceTinyPivot (yes_no_t) |
| * Specifies whether to replace the tiny diagonals by |
| * sqrt(epsilon)*norm(A) during LU factorization. |
| * |
| * m (input) int |
| * Number of rows in the matrix. |
| * |
| * n (input) int |
| * Number of columns in the matrix. |
| * |
| * anorm (input) double |
| * The norm of the original matrix A, or the scaled A if |
| * equilibration was done. |
| * |
| * LUstruct (input/output) LUstruct_t* |
| * The data structures to store the distributed L and U factors. |
| * The following fields should be defined: |
| * |
| * o Glu_persist (input) Glu_persist_t* |
| * Global data structure (xsup, supno) replicated on all processes, |
| * describing the supernode partition in the factored matrices |
| * L and U: |
| * xsup[s] is the leading column of the s-th supernode, |
| * supno[i] is the supernode number to which column i belongs. |
| * |
| * o Llu (input/output) LocalLU_t* |
| * The distributed data structures to store L and U factors. |
| * See superlu_ddefs.h for the definition of 'LocalLU_t'. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. It contains the MPI communicator, the number |
| * of process rows (NPROW), the number of process columns (NPCOL), |
| * and my process rank. It is an input argument to all the |
| * parallel routines. |
| * Grid can be initialized by subroutine SUPERLU_GRIDINIT. |
| * See superlu_ddefs.h for the definition of 'gridinfo_t'. |
| * |
| * stat (output) SuperLUStat_t* |
| * Record the statistics on runtime and floating-point operation count. |
| * See util.h for the definition of 'SuperLUStat_t'. |
| * |
| * info (output) int* |
| * = 0: successful exit |
| * < 0: if info = -i, the i-th argument had an illegal value |
| * > 0: if info = i, U(i,i) is exactly zero. The factorization has |
| * been completed, but the factor U is exactly singular, |
| * and division by zero will occur if it is used to solve a |
| * system of equations. |
| * </pre> |
| */ |
| void pdgstrf |
| /************************************************************************/ |
| ( |
| superlu_options_t *options, int m, int n, double anorm, |
| LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info |
| ) |
| { |
| #ifdef _CRAY |
| _fcd ftcs = _cptofcd("N", strlen("N")); |
| _fcd ftcs1 = _cptofcd("L", strlen("L")); |
| _fcd ftcs2 = _cptofcd("N", strlen("N")); |
| _fcd ftcs3 = _cptofcd("U", strlen("U")); |
| #endif |
| double alpha = 1.0, beta = 0.0; |
| int_t *xsup; |
| int_t *lsub, *lsub1, *usub, *Usub_buf, |
| *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ |
| double *lusup, *lusup1, *uval, *Uval_buf, |
| *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ |
| int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, |
| lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, |
| nlb, nub, nsupc, rel, rukp; |
| int_t Pc, Pr; |
| int iam, kcol, krow, mycol, myrow, pi, pj; |
| int j, k, lk, nsupers; |
| int nsupr, nbrow, segsize; |
| int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: |
| * 0 : transferred in Lsub_buf[] |
| * 1 : transferred in Lval_buf[] |
| * 2 : transferred in Usub_buf[] |
| * 3 : transferred in Uval_buf[] |
| */ |
| int_t msg0, msg2; |
| int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; |
| double **Unzval_br_ptr, **Lnzval_bc_ptr; |
| int_t *index; |
| double *nzval; |
| int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ |
| double *ucol; |
| int_t *indirect; |
| double *tempv, *tempv2d; |
| int_t iinfo; |
| int_t *ToRecv, *ToSendD, **ToSendR; |
| Glu_persist_t *Glu_persist = LUstruct->Glu_persist; |
| LocalLU_t *Llu = LUstruct->Llu; |
| superlu_scope_t *scp; |
| double s_eps, thresh; |
| double *tempU2d, *tempu; |
| int full, ldt, ldu, lead_zero, ncols; |
| MPI_Request recv_req[4], *send_req; |
| MPI_Status status; |
| #ifdef CRAY_X1 |
| int nonzero_segs; |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| int_t num_copy=0, num_update=0; |
| #endif |
| #if ( PRNTlevel==3 ) |
| int_t zero_msg = 0, total_msg = 0; |
| #endif |
| #if ( PROFlevel>=1 ) |
| double t1, t2; |
| float msg_vol = 0, msg_cnt = 0; |
| int_t iword = sizeof(int_t), dword = sizeof(double); |
| #endif |
| /* Test the input parameters. */ |
| *info = 0; |
| if ( m < 0 ) *info = -2; |
| else if ( n < 0 ) *info = -3; |
| if ( *info ) { |
| pxerbla("pdgstrf", grid, -*info); |
| return; |
| } |
| /* Quick return if possible. */ |
| if ( m == 0 || n == 0 ) return; |
| /* |
| * Initialization. |
| */ |
| iam = grid->iam; |
| Pc = grid->npcol; |
| Pr = grid->nprow; |
| myrow = MYROW( iam, grid ); |
| mycol = MYCOL( iam, grid ); |
| nsupers = Glu_persist->supno[n-1] + 1; |
| xsup = Glu_persist->xsup; |
| s_eps = slamch_("Epsilon"); |
| thresh = s_eps * anorm; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter pdgstrf()"); |
| #endif |
| stat->ops[FACT] = 0.0; |
| if ( Pr*Pc > 1 ) { |
| i = Llu->bufmax[0]; |
| if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) |
| ABORT("Malloc fails for Lsub_buf."); |
| Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; |
| i = Llu->bufmax[1]; |
| if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) |
| ABORT("Malloc fails for Lval_buf[]."); |
| Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; |
| if ( Llu->bufmax[2] != 0 ) |
| if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) |
| ABORT("Malloc fails for Usub_buf[]."); |
| if ( Llu->bufmax[3] != 0 ) |
| if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) |
| ABORT("Malloc fails for Uval_buf[]."); |
| if ( !(send_req = |
| (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) |
| ABORT("Malloc fails for send_req[]."); |
| } |
| if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) ) |
| ABORT("Malloc fails for ujrow[]."); |
| #if ( PRNTlevel>=1 ) |
| if ( !iam ) { |
| printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); |
| printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", |
| Llu->bufmax[0], Llu->bufmax[1], |
| Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); |
| } |
| #endif |
| Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; |
| Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; |
| Lval_buf_2[0] = Llu->Lval_buf_2[0]; |
| Lval_buf_2[1] = Llu->Lval_buf_2[1]; |
| Usub_buf = Llu->Usub_buf; |
| Uval_buf = Llu->Uval_buf; |
| Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; |
| Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; |
| Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; |
| Unzval_br_ptr = Llu->Unzval_br_ptr; |
| ToRecv = Llu->ToRecv; |
| ToSendD = Llu->ToSendD; |
| ToSendR = Llu->ToSendR; |
| ldt = sp_ienv_dist(3); /* Size of maximum supernode */ |
| if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) |
| ABORT("Calloc fails for tempv2d[]."); |
| tempU2d = tempv2d + ldt*ldt; |
| #ifdef CRAY_X1 |
| if ( !(indirect = intMalloc_dist(2*ldt)) ) |
| ABORT("Malloc fails for indirect[]."); |
| #else |
| if ( !(indirect = intMalloc_dist(ldt)) ) |
| ABORT("Malloc fails for indirect[]."); |
| #endif |
| k = CEILING( nsupers, Pr ); /* Number of local block rows */ |
| if ( !(iuip = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for iuip[]."); |
| if ( !(ruip = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for ruip[]."); |
| #if ( VAMPIR>=1 ) |
| VT_symdef(1, "Send-L", "Comm"); |
| VT_symdef(2, "Recv-L", "Comm"); |
| VT_symdef(3, "Send-U", "Comm"); |
| VT_symdef(4, "Recv-U", "Comm"); |
| VT_symdef(5, "TRF2", "Factor"); |
| VT_symdef(100, "Factor", "Factor"); |
| VT_begin(100); |
| VT_traceon(); |
| #endif |
| /* --------------------------------------------------------------- |
| Handle the first block column separately to start the pipeline. |
| --------------------------------------------------------------- */ |
| if ( mycol == 0 ) { |
| #if ( VAMPIR>=1 ) |
| VT_begin(5); |
| #endif |
| pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info); |
| #if ( VAMPIR>=1 ) |
| VT_end(5); |
| #endif |
| scp = &grid->rscp; /* The scope of process row. */ |
| /* Process column *kcol* multicasts numeric values of L(:,k) |
| to process rows. */ |
| lsub = Lrowind_bc_ptr[0]; |
| lusup = Lnzval_bc_ptr[0]; |
| if ( lsub ) { |
| msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; |
| msgcnt[1] = lsub[1] * SuperSize( 0 ); |
| } else { |
| msgcnt[0] = msgcnt[1] = 0; |
| } |
| for (pj = 0; pj < Pc; ++pj) { |
| if ( ToSendR[0][pj] != EMPTY ) { |
| #if ( PROFlevel>=1 ) |
| TIC(t1); |
| #endif |
| #if ( VAMPIR>=1 ) |
| VT_begin(1); |
| #endif |
| MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, |
| &send_req[pj] ); |
| MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, |
| &send_req[pj+Pc] ); |
| #if ( DEBUGlevel>=2 ) |
| printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", |
| iam, 0, msgcnt[0], msgcnt[1], pj); |
| #endif |
| #if ( VAMPIR>=1 ) |
| VT_end(1); |
| #endif |
| #if ( PROFlevel>=1 ) |
| TOC(t2, t1); |
| stat->utime[COMM] += t2; |
| msg_cnt += 2; |
| msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; |
| #endif |
| } |
| } /* for pj ... */ |
| } else { /* Post immediate receives. */ |
| if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ |
| scp = &grid->rscp; /* The scope of process row. */ |
| MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, |
| 0, scp->comm, &recv_req[0] ); |
| MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, |
| 1, scp->comm, &recv_req[1] ); |
| #if ( DEBUGlevel>=2 ) |
| printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); |
| #endif |
| } |
| } /* if mycol == 0 */ |
| /* ------------------------------------------ |
| MAIN LOOP: Loop through all block columns. |
| ------------------------------------------ */ |
| for (k = 0; k < nsupers; ++k) { |
| knsupc = SuperSize( k ); |
| krow = PROW( k, grid ); |
| kcol = PCOL( k, grid ); |
| if ( mycol == kcol ) { |
| lk = LBj( k, grid ); /* Local block number. */ |
| for (pj = 0; pj < Pc; ++pj) { |
| /* Wait for Isend to complete before using lsub/lusup. */ |
| if ( ToSendR[lk][pj] != EMPTY ) { |
| MPI_Wait( &send_req[pj], &status ); |
| MPI_Wait( &send_req[pj+Pc], &status ); |
| } |
| } |
| lsub = Lrowind_bc_ptr[lk]; |
| lusup = Lnzval_bc_ptr[lk]; |
| } else { |
| if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ |
| scp = &grid->rscp; /* The scope of process row. */ |
| #if ( PROFlevel>=1 ) |
| TIC(t1); |
| #endif |
| #if ( VAMPIR>=1 ) |
| VT_begin(2); |
| #endif |
| /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, |
| Llu->bufmax[0]);*/ |
| /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, |
| (4*k)%NTAGS, scp->comm, &status );*/ |
| MPI_Wait( &recv_req[0], &status ); |
| MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); |
| /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, |
| Llu->bufmax[1]);*/ |
| /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, |
| (4*k+1)%NTAGS, scp->comm, &status );*/ |
| MPI_Wait( &recv_req[1], &status ); |
| MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); |
| #if ( VAMPIR>=1 ) |
| VT_end(2); |
| #endif |
| #if ( PROFlevel>=1 ) |
| TOC(t2, t1); |
| stat->utime[COMM] += t2; |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", |
| iam, k, msgcnt[0], msgcnt[1], kcol); |
| fflush(stdout); |
| #endif |
| lsub = Lsub_buf_2[k%2]; |
| lusup = Lval_buf_2[k%2]; |
| #if ( PRNTlevel==3 ) |
| ++total_msg; |
| if ( !msgcnt[0] ) ++zero_msg; |
| #endif |
| } else msgcnt[0] = 0; |
| } /* if mycol = Pc(k) */ |
| scp = &grid->cscp; /* The scope of process column. */ |
| if ( myrow == krow ) { |
| /* Parallel triangular solve across process row *krow* -- |
| U(k,j) = L(k,k) \ A(k,j). */ |
| #ifdef _CRAY |
| pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); |
| #else |
| pdgstrs2(n, k, Glu_persist, grid, Llu, stat); |
| #endif |
| /* Multicasts U(k,:) to process columns. */ |
| lk = LBi( k, grid ); |
| usub = Ufstnz_br_ptr[lk]; |
| uval = Unzval_br_ptr[lk]; |
| if ( usub ) { |
| msgcnt[2] = usub[2]; |
| msgcnt[3] = usub[1]; |
| } else { |
| msgcnt[2] = msgcnt[3] = 0; |
| } |
| if ( ToSendD[lk] == YES ) { |
| for (pi = 0; pi < Pr; ++pi) { |
| if ( pi != myrow ) { |
| #if ( PROFlevel>=1 ) |
| TIC(t1); |
| #endif |
| #if ( VAMPIR>=1 ) |
| VT_begin(3); |
| #endif |
| MPI_Send( usub, msgcnt[2], mpi_int_t, pi, |
| (4*k+2)%NTAGS, scp->comm); |
| MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, |
| (4*k+3)%NTAGS, scp->comm); |
| #if ( VAMPIR>=1 ) |
| VT_end(3); |
| #endif |
| #if ( PROFlevel>=1 ) |
| TOC(t2, t1); |
| stat->utime[COMM] += t2; |
| msg_cnt += 2; |
| msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); |
| #endif |
| } /* if pi ... */ |
| } /* for pi ... */ |
| } /* if ToSendD ... */ |
| } else { /* myrow != krow */ |
| if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ |
| #if ( PROFlevel>=1 ) |
| TIC(t1); |
| #endif |
| #if ( VAMPIR>=1 ) |
| VT_begin(4); |
| #endif |
| /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, |
| Llu->bufmax[2]);*/ |
| MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, |
| (4*k+2)%NTAGS, scp->comm, &status ); |
| MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); |
| /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, |
| Llu->bufmax[3]);*/ |
| MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, |
| (4*k+3)%NTAGS, scp->comm, &status ); |
| MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); |
| #if ( VAMPIR>=1 ) |
| VT_end(4); |
| #endif |
| #if ( PROFlevel>=1 ) |
| TOC(t2, t1); |
| stat->utime[COMM] += t2; |
| #endif |
| usub = Usub_buf; |
| uval = Uval_buf; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); |
| #endif |
| #if ( PRNTlevel==3 ) |
| ++total_msg; |
| if ( !msgcnt[2] ) ++zero_msg; |
| #endif |
| } else msgcnt[2] = 0; |
| } /* if myrow == Pr(k) */ |
| /* |
| * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). |
| * for (j = k+1; k < N; ++k) { |
| * for (i = k+1; i < N; ++i) |
| * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) |
| * && L(i,k) != 0 && U(k,j) != 0 ) |
| * A(i,j) = A(i,j) - L(i,k) * U(k,j); |
| */ |
| msg0 = msgcnt[0]; |
| msg2 = msgcnt[2]; |
| if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ |
| nsupr = lsub[1]; /* LDA of lusup. */ |
| if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ |
| lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; |
| luptr0 = knsupc; |
| nlb = lsub[0] - 1; |
| } else { |
| lptr0 = BC_HEADER; |
| luptr0 = 0; |
| nlb = lsub[0]; |
| } |
| lptr = lptr0; |
| for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ |
| ib = lsub[lptr]; |
| lib = LBi( ib, grid ); |
| iuip[lib] = BR_HEADER; |
| ruip[lib] = 0; |
| lptr += LB_DESCRIPTOR + lsub[lptr+1]; |
| } |
| nub = usub[0]; /* Number of blocks in the block row U(k,:) */ |
| iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ |
| rukp = 0; /* Pointer to nzval[] of U(k,:) */ |
| klst = FstBlockC( k+1 ); |
| /* --------------------------------------------------- |
| Update the first block column A(:,k+1). |
| --------------------------------------------------- */ |
| jb = usub[iukp]; /* Global block number of block U(k,j). */ |
| if ( jb == k+1 ) { /* First update (k+1)-th block. */ |
| --nub; |
| lptr = lptr0; |
| luptr = luptr0; |
| ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ |
| nsupc = SuperSize( jb ); |
| iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ |
| /* Prepare to call DGEMM. */ |
| jj = iukp; |
| while ( usub[jj] == klst ) ++jj; |
| ldu = klst - usub[jj++]; |
| ncols = 1; |
| full = 1; |
| for (; jj < iukp+nsupc; ++jj) { |
| segsize = klst - usub[jj]; |
| if ( segsize ) { |
| ++ncols; |
| if ( segsize != ldu ) full = 0; |
| if ( segsize > ldu ) ldu = segsize; |
| } |
| } |
| #if ( DEBUGlevel>=3 ) |
| ++num_update; |
| #endif |
| if ( full ) { |
| tempu = &uval[rukp]; |
| } else { /* Copy block U(k,j) into tempU2d. */ |
| #if ( DEBUGlevel>=3 ) |
| printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", |
| iam, full, k, jb, ldu, ncols, nsupc); |
| ++num_copy; |
| #endif |
| tempu = tempU2d; |
| for (jj = iukp; jj < iukp+nsupc; ++jj) { |
| segsize = klst - usub[jj]; |
| if ( segsize ) { |
| lead_zero = ldu - segsize; |
| for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; |
| tempu += lead_zero; |
| for (i = 0; i < segsize; ++i) |
| tempu[i] = uval[rukp+i]; |
| rukp += segsize; |
| tempu += segsize; |
| } |
| } |
| tempu = tempU2d; |
| rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ |
| } /* if full ... */ |
| for (lb = 0; lb < nlb; ++lb) { |
| ib = lsub[lptr]; /* Row block L(i,k). */ |
| nbrow = lsub[lptr+1]; /* Number of full rows. */ |
| lptr += LB_DESCRIPTOR; /* Skip descriptor. */ |
| tempv = tempv2d; |
| #ifdef _CRAY |
| SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, |
| &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, |
| tempu, &ldu, &beta, tempv, &ldt); |
| #else |
| dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, |
| &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, |
| tempu, &ldu, &beta, tempv, &ldt); |
| #endif |
| stat->ops[FACT] += 2 * nbrow * ldu * ncols; |
| /* Now gather the result into the destination block. */ |
| if ( ib < jb ) { /* A(i,j) is in U. */ |
| ilst = FstBlockC( ib+1 ); |
| lib = LBi( ib, grid ); |
| index = Ufstnz_br_ptr[lib]; |
| ijb = index[iuip[lib]]; |
| while ( ijb < jb ) { /* Search for dest block. */ |
| ruip[lib] += index[iuip[lib]+1]; |
| iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); |
| ijb = index[iuip[lib]]; |
| } |
| iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ |
| tempv = tempv2d; |
| for (jj = 0; jj < nsupc; ++jj) { |
| segsize = klst - usub[iukp + jj]; |
| fnz = index[iuip[lib]++]; |
| if ( segsize ) { /* Nonzero segment in U(k.j). */ |
| ucol = &Unzval_br_ptr[lib][ruip[lib]]; |
| for (i = 0, it = 0; i < nbrow; ++i) { |
| rel = lsub[lptr + i] - fnz; |
| ucol[rel] -= tempv[it++]; |
| } |
| tempv += ldt; |
| } |
| ruip[lib] += ilst - fnz; |
| } |
| } else { /* A(i,j) is in L. */ |
| index = Lrowind_bc_ptr[ljb]; |
| ldv = index[1]; /* LDA of the dest lusup. */ |
| lptrj = BC_HEADER; |
| luptrj = 0; |
| ijb = index[lptrj]; |
| while ( ijb != ib ) { /* Search for dest block -- |
| blocks are not ordered! */ |
| luptrj += index[lptrj+1]; |
| lptrj += LB_DESCRIPTOR + index[lptrj+1]; |
| ijb = index[lptrj]; |
| } |
| /* |
| * Build indirect table. This is needed because the |
| * indices are not sorted. |
| */ |
| fnz = FstBlockC( ib ); |
| lptrj += LB_DESCRIPTOR; |
| for (i = 0; i < index[lptrj-1]; ++i) { |
| rel = index[lptrj + i] - fnz; |
| indirect[rel] = i; |
| } |
| nzval = Lnzval_bc_ptr[ljb] + luptrj; |
| tempv = tempv2d; |
| for (jj = 0; jj < nsupc; ++jj) { |
| segsize = klst - usub[iukp + jj]; |
| if ( segsize ) { |
| /*#pragma _CRI cache_bypass nzval,tempv*/ |
| for (it = 0, i = 0; i < nbrow; ++i) { |
| rel = lsub[lptr + i] - fnz; |
| nzval[indirect[rel]] -= tempv[it++]; |
| } |
| tempv += ldt; |
| } |
| nzval += ldv; |
| } |
| } /* if ib < jb ... */ |
| lptr += nbrow; |
| luptr += nbrow; |
| } /* for lb ... */ |
| rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ |
| iukp += nsupc; |
| } /* if jb == k+1 */ |
| } /* if L(:,k) and U(k,:) not empty */ |
| if ( k+1 < nsupers ) { |
| kcol = PCOL( k+1, grid ); |
| if ( mycol == kcol ) { |
| #if ( VAMPIR>=1 ) |
| VT_begin(5); |
| #endif |
| /* Factor diagonal and subdiagonal blocks and test for exact |
| singularity. */ |
| pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info); |
| #if ( VAMPIR>=1 ) |
| VT_end(5); |
| #endif |
| /* Process column *kcol+1* multicasts numeric values of L(:,k+1) |
| to process rows. */ |
| lk = LBj( k+1, grid ); /* Local block number. */ |
| lsub1 = Lrowind_bc_ptr[lk]; |
| if ( lsub1 ) { |
| msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; |
| msgcnt[1] = lsub1[1] * SuperSize( k+1 ); |
| } else { |
| msgcnt[0] = 0; |
| msgcnt[1] = 0; |
| } |
| scp = &grid->rscp; /* The scope of process row. */ |
| for (pj = 0; pj < Pc; ++pj) { |
| if ( ToSendR[lk][pj] != EMPTY ) { |
| lusup1 = Lnzval_bc_ptr[lk]; |
| #if ( PROFlevel>=1 ) |
| TIC(t1); |
| #endif |
| #if ( VAMPIR>=1 ) |
| VT_begin(1); |
| #endif |
| MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, |
| (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); |
| MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, |
| (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); |
| #if ( VAMPIR>=1 ) |
| VT_end(1); |
| #endif |
| #if ( PROFlevel>=1 ) |
| TOC(t2, t1); |
| stat->utime[COMM] += t2; |
| msg_cnt += 2; |
| msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", |
| iam, k+1, msgcnt[0], msgcnt[1], pj); |
| #endif |
| } |
| } /* for pj ... */ |
| } else { /* Post Recv of block column L(:,k+1). */ |
| if ( ToRecv[k+1] >= 1 ) { |
| scp = &grid->rscp; /* The scope of process row. */ |
| MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, |
| (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); |
| MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, |
| (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); |
| #if ( DEBUGlevel>=2 ) |
| printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); |
| #endif |
| } |
| } /* if mycol == Pc(k+1) */ |
| } /* if k+1 < nsupers */ |
| if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ |
| /* --------------------------------------------------- |
| Update all other blocks using block row U(k,:) |
| --------------------------------------------------- */ |
| for (j = 0; j < nub; ++j) { |
| lptr = lptr0; |
| luptr = luptr0; |
| jb = usub[iukp]; /* Global block number of block U(k,j). */ |
| ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ |
| nsupc = SuperSize( jb ); |
| iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ |
| /* Prepare to call DGEMM. */ |
| jj = iukp; |
| while ( usub[jj] == klst ) ++jj; |
| ldu = klst - usub[jj++]; |
| ncols = 1; |
| full = 1; |
| for (; jj < iukp+nsupc; ++jj) { |
| segsize = klst - usub[jj]; |
| if ( segsize ) { |
| ++ncols; |
| if ( segsize != ldu ) full = 0; |
| if ( segsize > ldu ) ldu = segsize; |
| } |
| } |
| #if ( DEBUGlevel>=3 ) |
| printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", |
| iam, full, k, jb, ldu, ncols, nsupc); |
| ++num_update; |
| #endif |
| if ( full ) { |
| tempu = &uval[rukp]; |
| } else { /* Copy block U(k,j) into tempU2d. */ |
| #if ( DEBUGlevel>=3 ) |
| ++num_copy; |
| #endif |
| tempu = tempU2d; |
| for (jj = iukp; jj < iukp+nsupc; ++jj) { |
| segsize = klst - usub[jj]; |
| if ( segsize ) { |
| lead_zero = ldu - segsize; |
| for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; |
| tempu += lead_zero; |
| for (i = 0; i < segsize; ++i) |
| tempu[i] = uval[rukp+i]; |
| rukp += segsize; |
| tempu += segsize; |
| } |
| } |
| tempu = tempU2d; |
| rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ |
| } /* if full ... */ |
| for (lb = 0; lb < nlb; ++lb) { |
| ib = lsub[lptr]; /* Row block L(i,k). */ |
| nbrow = lsub[lptr+1]; /* Number of full rows. */ |
| lptr += LB_DESCRIPTOR; /* Skip descriptor. */ |
| tempv = tempv2d; |
| #ifdef _CRAY |
| SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, |
| &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, |
| tempu, &ldu, &beta, tempv, &ldt); |
| #else |
| dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, |
| &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, |
| tempu, &ldu, &beta, tempv, &ldt); |
| #endif |
| stat->ops[FACT] += 2 * nbrow * ldu * ncols; |
| /* Now gather the result into the destination block. */ |
| if ( ib < jb ) { /* A(i,j) is in U. */ |
| ilst = FstBlockC( ib+1 ); |
| lib = LBi( ib, grid ); |
| index = Ufstnz_br_ptr[lib]; |
| ijb = index[iuip[lib]]; |
| while ( ijb < jb ) { /* Search for dest block. */ |
| ruip[lib] += index[iuip[lib]+1]; |
| iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); |
| ijb = index[iuip[lib]]; |
| } |
| /* Skip descriptor. Now point to fstnz index of |
| block U(i,j). */ |
| iuip[lib] += UB_DESCRIPTOR; |
| tempv = tempv2d; |
| for (jj = 0; jj < nsupc; ++jj) { |
| segsize = klst - usub[iukp + jj]; |
| fnz = index[iuip[lib]++]; |
| if ( segsize ) { /* Nonzero segment in U(k.j). */ |
| ucol = &Unzval_br_ptr[lib][ruip[lib]]; |
| for (i = 0 ; i < nbrow; ++i) { |
| rel = lsub[lptr + i] - fnz; |
| ucol[rel] -= tempv[i]; |
| } |
| tempv += ldt; |
| } |
| ruip[lib] += ilst - fnz; |
| } |
| } else { /* A(i,j) is in L. */ |
| index = Lrowind_bc_ptr[ljb]; |
| ldv = index[1]; /* LDA of the dest lusup. */ |
| lptrj = BC_HEADER; |
| luptrj = 0; |
| ijb = index[lptrj]; |
| while ( ijb != ib ) { /* Search for dest block -- |
| blocks are not ordered! */ |
| luptrj += index[lptrj+1]; |
| lptrj += LB_DESCRIPTOR + index[lptrj+1]; |
| ijb = index[lptrj]; |
| } |
| /* |
| * Build indirect table. This is needed because the |
| * indices are not sorted for the L blocks. |
| */ |
| fnz = FstBlockC( ib ); |
| lptrj += LB_DESCRIPTOR; |
| for (i = 0; i < index[lptrj-1]; ++i) { |
| rel = index[lptrj + i] - fnz; |
| indirect[rel] = i; |
| } |
| nzval = Lnzval_bc_ptr[ljb] + luptrj; |
| tempv = tempv2d; |
| for (jj = 0; jj < nsupc; ++jj) { |
| segsize = klst - usub[iukp + jj]; |
| if ( segsize ) { |
| /*#pragma _CRI cache_bypass nzval,tempv*/ |
| for (i = 0; i < nbrow; ++i) { |
| rel = lsub[lptr + i] - fnz; |
| nzval[indirect[rel]] -= tempv[i]; |
| } |
| tempv += ldt; |
| } |
| nzval += ldv; |
| } |
| } /* if ib < jb ... */ |
| lptr += nbrow; |
| luptr += nbrow; |
| } /* for lb ... */ |
| rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ |
| iukp += nsupc; |
| } /* for j ... */ |
| } /* if k L(:,k) and U(k,:) are not empty */ |
| } |
| /* ------------------------------------------ |
| END MAIN LOOP: for k = ... |
| ------------------------------------------ */ |
| #if ( VAMPIR>=1 ) |
| VT_end(100); |
| VT_traceoff(); |
| #endif |
| if ( Pr*Pc > 1 ) { |
| SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ |
| SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ |
| if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); |
| if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); |
| SUPERLU_FREE(send_req); |
| } |
| SUPERLU_FREE(Llu->ujrow); |
| SUPERLU_FREE(tempv2d); |
| SUPERLU_FREE(indirect); |
| SUPERLU_FREE(iuip); |
| SUPERLU_FREE(ruip); |
| /* Prepare error message. */ |
| if ( *info == 0 ) *info = n + 1; |
| #if ( PROFlevel>=1 ) |
| TIC(t1); |
| #endif |
| MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); |
| #if ( PROFlevel>=1 ) |
| TOC(t2, t1); |
| stat->utime[COMM] += t2; |
| { |
| float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; |
| MPI_Reduce( &msg_cnt, &msg_cnt_sum, |
| 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); |
| MPI_Reduce( &msg_cnt, &msg_cnt_max, |
| 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); |
| MPI_Reduce( &msg_vol, &msg_vol_sum, |
| 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); |
| MPI_Reduce( &msg_vol, &msg_vol_max, |
| 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); |
| if ( !iam ) { |
| printf("\tPDGSTRF comm stat:" |
| "\tAvg\tMax\t\tAvg\tMax\n" |
| "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", |
| msg_cnt_sum/Pr/Pc, msg_cnt_max, |
| msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); |
| } |
| } |
| #endif |
| if ( iinfo == n + 1 ) *info = 0; |
| else *info = iinfo; |
| #if ( PRNTlevel==3 ) |
| MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); |
| if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); |
| MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); |
| if ( !iam ) printf(".. # total msg\t%d\n", iinfo); |
| #endif |
| #if ( PRNTlevel==2 ) |
| for (i = 0; i < Pr * Pc; ++i) { |
| if ( iam == i ) { |
| dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); |
| dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); |
| printf("(%d)\n", iam); |
| PrintInt10("Recv", nsupers, Llu->ToRecv); |
| } |
| MPI_Barrier( grid->comm ); |
| } |
| #endif |
| #if ( DEBUGlevel>=3 ) |
| printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); |
| #endif |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit pdgstrf()"); |
| #endif |
| } /* PDGSTRF */ |
| /************************************************************************/ |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * Factor diagonal and subdiagonal blocks and test for exact singularity. |
| * Only the process column that owns block column *k* participates |
| * in the work. |
| * |
| * Arguments |
| * ========= |
| * |
| * k (input) int (global) |
| * The column number of the block column to be factorized. |
| * |
| * thresh (input) double (global) |
| * The threshold value = s_eps * anorm. |
| * |
| * Glu_persist (input) Glu_persist_t* |
| * Global data structures (xsup, supno) replicated on all processes. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. |
| * |
| * Llu (input/output) LocalLU_t* |
| * Local data structures to store distributed L and U matrices. |
| * |
| * stat (output) SuperLUStat_t* |
| * Record the statistics about the factorization. |
| * See SuperLUStat_t structure defined in util.h. |
| * |
| * info (output) int* |
| * = 0: successful exit |
| * < 0: if info = -i, the i-th argument had an illegal value |
| * > 0: if info = i, U(i,i) is exactly zero. The factorization has |
| * been completed, but the factor U is exactly singular, |
| * and division by zero will occur if it is used to solve a |
| * system of equations. |
| * </pre> |
| */ |
| static void pdgstrf2 |
| /************************************************************************/ |
| ( |
| superlu_options_t *options, |
| int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, |
| LocalLU_t *Llu, SuperLUStat_t *stat, int* info |
| ) |
| { |
| int c, iam, l, pkk; |
| int incx = 1, incy = 1; |
| int nsupr; /* number of rows in the block (LDA) */ |
| int luptr; |
| int_t i, krow, j, jfst, jlst; |
| int_t nsupc; /* number of columns in the block */ |
| int_t *xsup = Glu_persist->xsup; |
| double *lusup, temp; |
| double *ujrow; |
| double alpha = -1; |
| *info = 0; |
| /* Quick return. */ |
| /* Initialization. */ |
| iam = grid->iam; |
| krow = PROW( k, grid ); |
| pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); |
| j = LBj( k, grid ); /* Local block number */ |
| jfst = FstBlockC( k ); |
| jlst = FstBlockC( k+1 ); |
| lusup = Llu->Lnzval_bc_ptr[j]; |
| nsupc = SuperSize( k ); |
| if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; |
| ujrow = Llu->ujrow; |
| luptr = 0; /* Point to the diagonal entries. */ |
| c = nsupc; |
| for (j = 0; j < jlst - jfst; ++j) { |
| /* Broadcast the j-th row (nsupc - j) elements to |
| the process column. */ |
| if ( iam == pkk ) { /* Diagonal process. */ |
| i = luptr; |
| if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { |
| if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ |
| #if ( PRNTlevel>=2 ) |
| printf("(%d) .. col %d, tiny pivot %e ", |
| iam, jfst+j, lusup[i]); |
| #endif |
| /* Keep the replaced diagonal with the same sign. */ |
| if ( lusup[i] < 0 ) lusup[i] = -thresh; |
| else lusup[i] = thresh; |
| #if ( PRNTlevel>=2 ) |
| printf("replaced by %e\n", lusup[i]); |
| #endif |
| ++(stat->TinyPivots); |
| } |
| } |
| for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; |
| } |
| #if 0 |
| dbcast_col(ujrow, c, pkk, UjROW, grid, &c); |
| #else |
| MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm); |
| /*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS, |
| grid, COMM_COLUMN, &c);*/ |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| if ( k == 3329 && j == 2 ) { |
| if ( iam == pkk ) { |
| printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); |
| } else { |
| printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); |
| } |
| } |
| #endif |
| if ( !lusup ) { /* Empty block column. */ |
| --c; |
| if ( ujrow[0] == 0.0 ) *info = j+jfst+1; |
| continue; |
| } |
| /* Test for singularity. */ |
| if ( ujrow[0] == 0.0 ) { |
| *info = j+jfst+1; |
| } else { |
| /* Scale the j-th column of the matrix. */ |
| temp = 1.0 / ujrow[0]; |
| if ( iam == pkk ) { |
| for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; |
| stat->ops[FACT] += nsupr-j-1; |
| } else { |
| for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; |
| stat->ops[FACT] += nsupr; |
| } |
| } |
| /* Rank-1 update of the trailing submatrix. */ |
| if ( --c ) { |
| if ( iam == pkk ) { |
| l = nsupr - j - 1; |
| #ifdef _CRAY |
| SGER(&l, &c, &alpha, &lusup[luptr+1], &incx, |
| &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); |
| #else |
| dger_(&l, &c, &alpha, &lusup[luptr+1], &incx, |
| &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); |
| #endif |
| stat->ops[FACT] += 2 * l * c; |
| } else { |
| #ifdef _CRAY |
| SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, |
| &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); |
| #else |
| dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, |
| &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); |
| #endif |
| stat->ops[FACT] += 2 * nsupr * c; |
| } |
| } |
| /* Move to the next column. */ |
| if ( iam == pkk ) luptr += nsupr + 1; |
| else luptr += nsupr; |
| } /* for j ... */ |
| } /* PDGSTRF2 */ |
| /************************************************************************/ |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * Perform parallel triangular solves |
| * U(k,:) := A(k,:) \ L(k,k). |
| * Only the process column that owns block column *k* participates |
| * in the work. |
| * |
| * Arguments |
| * ========= |
| * |
| * m (input) int (global) |
| * Number of rows in the matrix. |
| * |
| * k (input) int (global) |
| * The row number of the block row to be factorized. |
| * |
| * Glu_persist (input) Glu_persist_t* |
| * Global data structures (xsup, supno) replicated on all processes. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. |
| * |
| * Llu (input/output) LocalLU_t* |
| * Local data structures to store distributed L and U matrices. |
| * |
| * stat (output) SuperLUStat_t* |
| * Record the statistics about the factorization; |
| * See SuperLUStat_t structure defined in util.h. |
| * </pre> |
| */ |
| static void pdgstrs2 |
| /************************************************************************/ |
| #ifdef _CRAY |
| ( |
| int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, |
| LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 |
| ) |
| #else |
| ( |
| int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, |
| LocalLU_t *Llu, SuperLUStat_t *stat |
| ) |
| #endif |
| { |
| int iam, pkk; |
| int incx = 1; |
| int nsupr; /* number of rows in the block L(:,k) (LDA) */ |
| int segsize; |
| int_t nsupc; /* number of columns in the block */ |
| int_t luptr, iukp, rukp; |
| int_t b, gb, j, klst, knsupc, lk, nb; |
| int_t *xsup = Glu_persist->xsup; |
| int_t *usub; |
| double *lusup, *uval; |
| /* Quick return. */ |
| lk = LBi( k, grid ); /* Local block number */ |
| if ( !Llu->Unzval_br_ptr[lk] ) return; |
| /* Initialization. */ |
| iam = grid->iam; |
| pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); |
| klst = FstBlockC( k+1 ); |
| knsupc = SuperSize( k ); |
| usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ |
| uval = Llu->Unzval_br_ptr[lk]; |
| nb = usub[0]; |
| iukp = BR_HEADER; |
| rukp = 0; |
| if ( iam == pkk ) { |
| lk = LBj( k, grid ); |
| nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ |
| lusup = Llu->Lnzval_bc_ptr[lk]; |
| } else { |
| nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ |
| lusup = Llu->Lval_buf_2[k%2]; |
| } |
| /* Loop through all the row blocks. */ |
| for (b = 0; b < nb; ++b) { |
| gb = usub[iukp]; |
| nsupc = SuperSize( gb ); |
| iukp += UB_DESCRIPTOR; |
| /* Loop through all the segments in the block. */ |
| for (j = 0; j < nsupc; ++j) { |
| segsize = klst - usub[iukp++]; |
| if ( segsize ) { /* Nonzero segment. */ |
| luptr = (knsupc - segsize) * (nsupr + 1); |
| #ifdef _CRAY |
| STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, |
| &uval[rukp], &incx); |
| #else |
| dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, |
| &uval[rukp], &incx); |
| #endif |
| stat->ops[FACT] += segsize * (segsize + 1); |
| rukp += segsize; |
| } |
| } |
| } /* for b ... */ |
| } /* PDGSTRS2 */ |
| static int |
| probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm, |
| int buf_size) |
| { |
| MPI_Status status; |
| int count; |
| MPI_Probe( source, tag, comm, &status ); |
| MPI_Get_count( &status, datatype, &count ); |
| if ( count > buf_size ) { |
| printf("(%d) Recv'ed count %d > buffer size $d\n", |
| iam, count, buf_size); |
| exit(-1); |
| } |
| return 0; |
| } |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/ddistribute.c |
|---|
| New file |
| 0,0 → 1,737 |
| /*! @file |
| * \brief Distribute the matrix onto the 2D process mesh. |
| * |
| * <pre> |
| * -- Distributed SuperLU routine (version 2.3) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * October 15, 2008 |
| * </pre> |
| */ |
| #include "superlu_ddefs.h" |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * Distribute the matrix onto the 2D process mesh. |
| * |
| * Arguments |
| * ========= |
| * |
| * fact (input) fact_t |
| * Specifies whether or not the L and U structures will be re-used. |
| * = SamePattern_SameRowPerm: L and U structures are input, and |
| * unchanged on exit. |
| * = DOFACT or SamePattern: L and U structures are computed and output. |
| * |
| * n (input) int |
| * Dimension of the matrix. |
| * |
| * A (input) SuperMatrix* |
| * The original matrix A, permuted by columns, of dimension |
| * (A->nrow, A->ncol). The type of A can be: |
| * Stype = SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. |
| * |
| * LUstruct (input) LUstruct_t* |
| * Data structures for L and U factors. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. |
| * |
| * Return value |
| * ============ |
| * > 0, working storage required (in bytes). |
| * </pre> |
| */ |
| float |
| ddistribute(fact_t fact, int_t n, SuperMatrix *A, |
| Glu_freeable_t *Glu_freeable, |
| LUstruct_t *LUstruct, gridinfo_t *grid) |
| { |
| Glu_persist_t *Glu_persist = LUstruct->Glu_persist; |
| LocalLU_t *Llu = LUstruct->Llu; |
| int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, jb, jj, k, |
| len, len1, nsupc; |
| int_t ljb; /* local block column number */ |
| int_t nrbl; /* number of L blocks in current block column */ |
| int_t nrbu; /* number of U blocks in current block column */ |
| int_t gb; /* global block number; 0 < gb <= nsuper */ |
| int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ |
| int iam, jbrow, kcol, mycol, myrow, pc, pr; |
| int_t mybufmax[NBUFFERS]; |
| NCPformat *Astore; |
| double *a; |
| int_t *asub; |
| int_t *xa_begin, *xa_end; |
| int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ |
| int_t *supno = Glu_persist->supno; |
| int_t *lsub, *xlsub, *usub, *xusub; |
| int_t nsupers; |
| int_t next_lind; /* next available position in index[*] */ |
| int_t next_lval; /* next available position in nzval[*] */ |
| int_t *index; /* indices consist of headers and row subscripts */ |
| double *lusup, *uval; /* nonzero values in L and U */ |
| double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ |
| int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ |
| double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ |
| int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ |
| /*-- Counts to be used in factorization. --*/ |
| int_t *ToRecv, *ToSendD, **ToSendR; |
| /*-- Counts to be used in lower triangular solve. --*/ |
| int_t *fmod; /* Modification count for L-solve. */ |
| int_t **fsendx_plist; /* Column process list to send down Xk. */ |
| int_t nfrecvx = 0; /* Number of Xk I will receive. */ |
| int_t nfsendx = 0; /* Number of Xk I will send */ |
| int_t kseen; |
| /*-- Counts to be used in upper triangular solve. --*/ |
| int_t *bmod; /* Modification count for U-solve. */ |
| int_t **bsendx_plist; /* Column process list to send down Xk. */ |
| int_t nbrecvx = 0; /* Number of Xk I will receive. */ |
| int_t nbsendx = 0; /* Number of Xk I will send */ |
| int_t *ilsum; /* starting position of each supernode in |
| the full array (local) */ |
| /*-- Auxiliary arrays; freed on return --*/ |
| int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ |
| int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ |
| int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ |
| int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ |
| int_t *Ucbs; /* number of column blocks in a block row */ |
| int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ |
| int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ |
| int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ |
| int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ |
| double *dense, *dense_col; /* SPA */ |
| double zero = 0.0; |
| int_t ldaspa; /* LDA of SPA */ |
| int_t iword, dword; |
| float mem_use = 0.0; |
| #if ( PRNTlevel>=1 ) |
| int_t nLblocks = 0, nUblocks = 0; |
| #endif |
| #if ( PROFlevel>=1 ) |
| double t, t_u, t_l; |
| int_t u_blks; |
| #endif |
| /* Initialization. */ |
| iam = grid->iam; |
| myrow = MYROW( iam, grid ); |
| mycol = MYCOL( iam, grid ); |
| for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; |
| nsupers = supno[n-1] + 1; |
| Astore = A->Store; |
| a = Astore->nzval; |
| asub = Astore->rowind; |
| xa_begin = Astore->colbeg; |
| xa_end = Astore->colend; |
| #if ( PRNTlevel>=1 ) |
| iword = sizeof(int_t); |
| dword = sizeof(double); |
| #endif |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter ddistribute()"); |
| #endif |
| if ( fact == SamePattern_SameRowPerm ) { |
| /* --------------------------------------------------------------- |
| * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. |
| * --------------------------------------------------------------- */ |
| #if ( PROFlevel>=1 ) |
| t_l = t_u = 0; u_blks = 0; |
| #endif |
| /* We can propagate the new values of A into the existing |
| L and U data structures. */ |
| ilsum = Llu->ilsum; |
| ldaspa = Llu->ldalsum; |
| if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) |
| ABORT("Calloc fails for SPA dense[]."); |
| nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ |
| if ( !(Urb_length = intCalloc_dist(nrbu)) ) |
| ABORT("Calloc fails for Urb_length[]."); |
| if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) |
| ABORT("Malloc fails for Urb_indptr[]."); |
| Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; |
| Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; |
| Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; |
| Unzval_br_ptr = Llu->Unzval_br_ptr; |
| #if ( PRNTlevel>=1 ) |
| mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword; |
| #endif |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_(); |
| #endif |
| /* Initialize Uval to zero. */ |
| for (lb = 0; lb < nrbu; ++lb) { |
| Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ |
| index = Ufstnz_br_ptr[lb]; |
| if ( index ) { |
| uval = Unzval_br_ptr[lb]; |
| len = index[1]; |
| for (i = 0; i < len; ++i) uval[i] = zero; |
| } /* if index != NULL */ |
| } /* for lb ... */ |
| for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ |
| pc = PCOL( jb, grid ); |
| if ( mycol == pc ) { /* Block column jb in my process column */ |
| fsupc = FstBlockC( jb ); |
| nsupc = SuperSize( jb ); |
| /* Scatter A into SPA (for L), or into U directly. */ |
| for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { |
| for (i = xa_begin[j]; i < xa_end[j]; ++i) { |
| irow = asub[i]; |
| gb = BlockNum( irow ); |
| if ( myrow == PROW( gb, grid ) ) { |
| lb = LBi( gb, grid ); |
| if ( gb < jb ) { /* in U */ |
| index = Ufstnz_br_ptr[lb]; |
| uval = Unzval_br_ptr[lb]; |
| while ( (k = index[Urb_indptr[lb]]) < jb ) { |
| /* Skip nonzero values in this block */ |
| Urb_length[lb] += index[Urb_indptr[lb]+1]; |
| /* Move pointer to the next block */ |
| Urb_indptr[lb] += UB_DESCRIPTOR |
| + SuperSize( k ); |
| } |
| /*assert(k == jb);*/ |
| /* start fstnz */ |
| istart = Urb_indptr[lb] + UB_DESCRIPTOR; |
| len = Urb_length[lb]; |
| fsupc1 = FstBlockC( gb+1 ); |
| k = j - fsupc; |
| /* Sum the lengths of the leading columns */ |
| for (jj = 0; jj < k; ++jj) |
| len += fsupc1 - index[istart++]; |
| /*assert(irow>=index[istart]);*/ |
| uval[len + irow - index[istart]] = a[i]; |
| } else { /* in L; put in SPA first */ |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| dense_col[irow] = a[i]; |
| } |
| } |
| } /* for i ... */ |
| dense_col += ldaspa; |
| } /* for j ... */ |
| #if ( PROFlevel>=1 ) |
| t_u += SuperLU_timer_() - t; |
| t = SuperLU_timer_(); |
| #endif |
| /* Gather the values of A from SPA into Lnzval[]. */ |
| ljb = LBj( jb, grid ); /* Local block number */ |
| index = Lrowind_bc_ptr[ljb]; |
| if ( index ) { |
| nrbl = index[0]; /* Number of row blocks. */ |
| len = index[1]; /* LDA of lusup[]. */ |
| lusup = Lnzval_bc_ptr[ljb]; |
| next_lind = BC_HEADER; |
| next_lval = 0; |
| for (jj = 0; jj < nrbl; ++jj) { |
| gb = index[next_lind++]; |
| len1 = index[next_lind++]; /* Rows in the block. */ |
| lb = LBi( gb, grid ); |
| for (bnnz = 0; bnnz < len1; ++bnnz) { |
| irow = index[next_lind++]; /* Global index. */ |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| k = next_lval++; |
| for (j = 0, dense_col = dense; j < nsupc; ++j) { |
| lusup[k] = dense_col[irow]; |
| dense_col[irow] = zero; |
| k += len; |
| dense_col += ldaspa; |
| } |
| } /* for bnnz ... */ |
| } /* for jj ... */ |
| } /* if index ... */ |
| #if ( PROFlevel>=1 ) |
| t_l += SuperLU_timer_() - t; |
| #endif |
| } /* if mycol == pc */ |
| } /* for jb ... */ |
| SUPERLU_FREE(dense); |
| SUPERLU_FREE(Urb_length); |
| SUPERLU_FREE(Urb_indptr); |
| #if ( PROFlevel>=1 ) |
| if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", |
| t_l, t_u, u_blks, nrbu); |
| #endif |
| } else { |
| /* -------------------------------------------------- |
| * FIRST TIME CREATING THE L AND U DATA STRUCTURE. |
| * -------------------------------------------------- */ |
| #if ( PROFlevel>=1 ) |
| t_l = t_u = 0; u_blks = 0; |
| #endif |
| /* No L and U data structures are available yet. |
| We need to set up the L and U data structures and propagate |
| the values of A into them. */ |
| lsub = Glu_freeable->lsub; /* compressed L subscripts */ |
| xlsub = Glu_freeable->xlsub; |
| usub = Glu_freeable->usub; /* compressed U subscripts */ |
| xusub = Glu_freeable->xusub; |
| if ( !(ToRecv = intCalloc_dist(nsupers)) ) |
| ABORT("Calloc fails for ToRecv[]."); |
| k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ |
| if ( !(ToSendR = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) |
| ABORT("Malloc fails for ToSendR[]."); |
| j = k * grid->npcol; |
| if ( !(index = intMalloc_dist(j)) ) |
| ABORT("Malloc fails for index[]."); |
| #if ( PRNTlevel>=1 ) |
| mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; |
| #endif |
| for (i = 0; i < j; ++i) index[i] = EMPTY; |
| for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index[j]; |
| k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ |
| /* Pointers to the beginning of each block row of U. */ |
| if ( !(Unzval_br_ptr = |
| (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) |
| ABORT("Malloc fails for Unzval_br_ptr[]."); |
| if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) |
| ABORT("Malloc fails for Ufstnz_br_ptr[]."); |
| if ( !(ToSendD = intCalloc_dist(k)) ) |
| ABORT("Malloc fails for ToSendD[]."); |
| if ( !(ilsum = intMalloc_dist(k+1)) ) |
| ABORT("Malloc fails for ilsum[]."); |
| /* Auxiliary arrays used to set up U block data structures. |
| They are freed on return. */ |
| if ( !(rb_marker = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for rb_marker[]."); |
| if ( !(Urb_length = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for Urb_length[]."); |
| if ( !(Urb_indptr = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for Urb_indptr[]."); |
| if ( !(Urb_fstnz = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for Urb_fstnz[]."); |
| if ( !(Ucbs = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for Ucbs[]."); |
| #if ( PRNTlevel>=1 ) |
| mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword; |
| #endif |
| /* Compute ldaspa and ilsum[]. */ |
| ldaspa = 0; |
| ilsum[0] = 0; |
| for (gb = 0; gb < nsupers; ++gb) { |
| if ( myrow == PROW( gb, grid ) ) { |
| i = SuperSize( gb ); |
| ldaspa += i; |
| lb = LBi( gb, grid ); |
| ilsum[lb + 1] = ilsum[lb] + i; |
| } |
| } |
| /* ------------------------------------------------------------ |
| COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. |
| THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). |
| ------------------------------------------------------------*/ |
| /* Loop through each supernode column. */ |
| for (jb = 0; jb < nsupers; ++jb) { |
| pc = PCOL( jb, grid ); |
| fsupc = FstBlockC( jb ); |
| nsupc = SuperSize( jb ); |
| /* Loop through each column in the block. */ |
| for (j = fsupc; j < fsupc + nsupc; ++j) { |
| /* usub[*] contains only "first nonzero" in each segment. */ |
| for (i = xusub[j]; i < xusub[j+1]; ++i) { |
| irow = usub[i]; /* First nonzero of the segment. */ |
| gb = BlockNum( irow ); |
| kcol = PCOL( gb, grid ); |
| ljb = LBj( gb, grid ); |
| if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; |
| pr = PROW( gb, grid ); |
| lb = LBi( gb, grid ); |
| if ( mycol == pc ) { |
| if ( myrow == pr ) { |
| ToSendD[lb] = YES; |
| /* Count nonzeros in entire block row. */ |
| Urb_length[lb] += FstBlockC( gb+1 ) - irow; |
| if (rb_marker[lb] <= jb) {/* First see the block */ |
| rb_marker[lb] = jb + 1; |
| Urb_fstnz[lb] += nsupc; |
| ++Ucbs[lb]; /* Number of column blocks |
| in block row lb. */ |
| #if ( PRNTlevel>=1 ) |
| ++nUblocks; |
| #endif |
| } |
| ToRecv[gb] = 1; |
| } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ |
| } |
| } /* for i ... */ |
| } /* for j ... */ |
| } /* for jb ... */ |
| /* Set up the initial pointers for each block row in U. */ |
| nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ |
| for (lb = 0; lb < nrbu; ++lb) { |
| len = Urb_length[lb]; |
| rb_marker[lb] = 0; /* Reset block marker. */ |
| if ( len ) { |
| /* Add room for descriptors */ |
| len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; |
| if ( !(index = intMalloc_dist(len1+1)) ) |
| ABORT("Malloc fails for Uindex[]."); |
| Ufstnz_br_ptr[lb] = index; |
| if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) |
| ABORT("Malloc fails for Unzval_br_ptr[*][]."); |
| mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); |
| mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); |
| index[0] = Ucbs[lb]; /* Number of column blocks */ |
| index[1] = len; /* Total length of nzval[] */ |
| index[2] = len1; /* Total length of index[] */ |
| index[len1] = -1; /* End marker */ |
| } else { |
| Ufstnz_br_ptr[lb] = NULL; |
| Unzval_br_ptr[lb] = NULL; |
| } |
| Urb_length[lb] = 0; /* Reset block length. */ |
| Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ |
| Urb_fstnz[lb] = BR_HEADER; |
| } /* for lb ... */ |
| SUPERLU_FREE(Ucbs); |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_() - t; |
| if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); |
| #endif |
| #if ( PRNTlevel>=1 ) |
| mem_use -= 2.0*k * iword; |
| #endif |
| /* Auxiliary arrays used to set up L block data structures. |
| They are freed on return. |
| k is the number of local row blocks. */ |
| if ( !(Lrb_length = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for Lrb_length[]."); |
| if ( !(Lrb_number = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for Lrb_number[]."); |
| if ( !(Lrb_indptr = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for Lrb_indptr[]."); |
| if ( !(Lrb_valptr = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for Lrb_valptr[]."); |
| if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) |
| *sp_ienv_dist(3))))) |
| ABORT("Calloc fails for SPA dense[]."); |
| /* These counts will be used for triangular solves. */ |
| if ( !(fmod = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for fmod[]."); |
| if ( !(bmod = intCalloc_dist(k)) ) |
| ABORT("Calloc fails for bmod[]."); |
| #if ( PRNTlevel>=1 ) |
| mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword; |
| #endif |
| k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ |
| /* Pointers to the beginning of each block column of L. */ |
| if ( !(Lnzval_bc_ptr = (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) |
| ABORT("Malloc fails for Lnzval_bc_ptr[]."); |
| if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) |
| ABORT("Malloc fails for Lrowind_bc_ptr[]."); |
| Lrowind_bc_ptr[k-1] = NULL; |
| /* These lists of processes will be used for triangular solves. */ |
| if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) |
| ABORT("Malloc fails for fsendx_plist[]."); |
| len = k * grid->nprow; |
| if ( !(index = intMalloc_dist(len)) ) |
| ABORT("Malloc fails for fsendx_plist[0]"); |
| for (i = 0; i < len; ++i) index[i] = EMPTY; |
| for (i = 0, j = 0; i < k; ++i, j += grid->nprow) |
| fsendx_plist[i] = &index[j]; |
| if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) |
| ABORT("Malloc fails for bsendx_plist[]."); |
| if ( !(index = intMalloc_dist(len)) ) |
| ABORT("Malloc fails for bsendx_plist[0]"); |
| for (i = 0; i < len; ++i) index[i] = EMPTY; |
| for (i = 0, j = 0; i < k; ++i, j += grid->nprow) |
| bsendx_plist[i] = &index[j]; |
| #if ( PRNTlevel>=1 ) |
| mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; |
| #endif |
| /*------------------------------------------------------------ |
| PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. |
| THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. |
| ------------------------------------------------------------*/ |
| for (jb = 0; jb < nsupers; ++jb) { |
| pc = PCOL( jb, grid ); |
| if ( mycol == pc ) { /* Block column jb in my process column */ |
| fsupc = FstBlockC( jb ); |
| nsupc = SuperSize( jb ); |
| ljb = LBj( jb, grid ); /* Local block number */ |
| /* Scatter A into SPA. */ |
| for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ |
| for (i = xa_begin[j]; i < xa_end[j]; ++i) { |
| irow = asub[i]; |
| gb = BlockNum( irow ); |
| if ( myrow == PROW( gb, grid ) ) { |
| lb = LBi( gb, grid ); |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| dense_col[irow] = a[i]; |
| } |
| } |
| dense_col += ldaspa; |
| } |
| jbrow = PROW( jb, grid ); |
| #if ( PROFlevel>=1 ) |
| t = SuperLU_timer_(); |
| #endif |
| /*------------------------------------------------ |
| * SET UP U BLOCKS. |
| *------------------------------------------------*/ |
| kseen = 0; |
| dense_col = dense; |
| /* Loop through each column in the block column. */ |
| for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { |
| istart = xusub[j]; |
| /* NOTE: Only the first nonzero index of the segment |
| is stored in usub[]. */ |
| for (i = istart; i < xusub[j+1]; ++i) { |
| irow = usub[i]; /* First nonzero in the segment. */ |
| gb = BlockNum( irow ); |
| pr = PROW( gb, grid ); |
| if ( pr != jbrow && |
| myrow == jbrow && /* diag. proc. owning jb */ |
| bsendx_plist[ljb][pr] == EMPTY ) { |
| bsendx_plist[ljb][pr] = YES; |
| ++nbsendx; |
| } |
| if ( myrow == pr ) { |
| lb = LBi( gb, grid ); /* Local block number */ |
| index = Ufstnz_br_ptr[lb]; |
| uval = Unzval_br_ptr[lb]; |
| fsupc1 = FstBlockC( gb+1 ); |
| if (rb_marker[lb] <= jb) { /* First time see |
| the block */ |
| rb_marker[lb] = jb + 1; |
| Urb_indptr[lb] = Urb_fstnz[lb];; |
| index[Urb_indptr[lb]] = jb; /* Descriptor */ |
| Urb_indptr[lb] += UB_DESCRIPTOR; |
| /* Record the first location in index[] of the |
| next block */ |
| Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; |
| len = Urb_indptr[lb];/* Start fstnz in index */ |
| index[len-1] = 0; |
| for (k = 0; k < nsupc; ++k) |
| index[len+k] = fsupc1; |
| if ( gb != jb )/* Exclude diagonal block. */ |
| ++bmod[lb];/* Mod. count for back solve */ |
| if ( kseen == 0 && myrow != jbrow ) { |
| ++nbrecvx; |
| kseen = 1; |
| } |
| } else { /* Already saw the block */ |
| len = Urb_indptr[lb];/* Start fstnz in index */ |
| } |
| jj = j - fsupc; |
| index[len+jj] = irow; |
| /* Load the numerical values */ |
| k = fsupc1 - irow; /* No. of nonzeros in segment */ |
| index[len-1] += k; /* Increment block length in |
| Descriptor */ |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| for (ii = 0; ii < k; ++ii) { |
| uval[Urb_length[lb]++] = dense_col[irow + ii]; |
| dense_col[irow + ii] = zero; |
| } |
| } /* if myrow == pr ... */ |
| } /* for i ... */ |
| dense_col += ldaspa; |
| } /* for j ... */ |
| #if ( PROFlevel>=1 ) |
| t_u += SuperLU_timer_() - t; |
| t = SuperLU_timer_(); |
| #endif |
| /*------------------------------------------------ |
| * SET UP L BLOCKS. |
| *------------------------------------------------*/ |
| /* Count number of blocks and length of each block. */ |
| nrbl = 0; |
| len = 0; /* Number of row subscripts I own. */ |
| kseen = 0; |
| istart = xlsub[fsupc]; |
| for (i = istart; i < xlsub[fsupc+1]; ++i) { |
| irow = lsub[i]; |
| gb = BlockNum( irow ); /* Global block number */ |
| pr = PROW( gb, grid ); /* Process row owning this block */ |
| if ( pr != jbrow && |
| myrow == jbrow && /* diag. proc. owning jb */ |
| fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { |
| fsendx_plist[ljb][pr] = YES; |
| ++nfsendx; |
| } |
| if ( myrow == pr ) { |
| lb = LBi( gb, grid ); /* Local block number */ |
| if (rb_marker[lb] <= jb) { /* First see this block */ |
| rb_marker[lb] = jb + 1; |
| Lrb_length[lb] = 1; |
| Lrb_number[nrbl++] = gb; |
| if ( gb != jb ) /* Exclude diagonal block. */ |
| ++fmod[lb]; /* Mod. count for forward solve */ |
| if ( kseen == 0 && myrow != jbrow ) { |
| ++nfrecvx; |
| kseen = 1; |
| } |
| #if ( PRNTlevel>=1 ) |
| ++nLblocks; |
| #endif |
| } else { |
| ++Lrb_length[lb]; |
| } |
| ++len; |
| } |
| } /* for i ... */ |
| if ( nrbl ) { /* Do not ensure the blocks are sorted! */ |
| /* Set up the initial pointers for each block in |
| index[] and nzval[]. */ |
| /* Add room for descriptors */ |
| len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; |
| if ( !(index = intMalloc_dist(len1)) ) |
| ABORT("Malloc fails for index[]"); |
| Lrowind_bc_ptr[ljb] = index; |
| if (!(Lnzval_bc_ptr[ljb] = doubleMalloc_dist(((size_t)len)*nsupc))) { |
| fprintf(stderr, "col block %d ", jb); |
| ABORT("Malloc fails for Lnzval_bc_ptr[*][]"); |
| } |
| mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); |
| mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); |
| mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); |
| index[0] = nrbl; /* Number of row blocks */ |
| index[1] = len; /* LDA of the nzval[] */ |
| next_lind = BC_HEADER; |
| next_lval = 0; |
| for (k = 0; k < nrbl; ++k) { |
| gb = Lrb_number[k]; |
| lb = LBi( gb, grid ); |
| len = Lrb_length[lb]; |
| Lrb_length[lb] = 0; /* Reset vector of block length */ |
| index[next_lind++] = gb; /* Descriptor */ |
| index[next_lind++] = len; |
| Lrb_indptr[lb] = next_lind; |
| Lrb_valptr[lb] = next_lval; |
| next_lind += len; |
| next_lval += len; |
| } |
| /* Propagate the compressed row subscripts to Lindex[], and |
| the initial values of A from SPA into Lnzval[]. */ |
| lusup = Lnzval_bc_ptr[ljb]; |
| len = index[1]; /* LDA of lusup[] */ |
| for (i = istart; i < xlsub[fsupc+1]; ++i) { |
| irow = lsub[i]; |
| gb = BlockNum( irow ); |
| if ( myrow == PROW( gb, grid ) ) { |
| lb = LBi( gb, grid ); |
| k = Lrb_indptr[lb]++; /* Random access a block */ |
| index[k] = irow; |
| k = Lrb_valptr[lb]++; |
| irow = ilsum[lb] + irow - FstBlockC( gb ); |
| for (j = 0, dense_col = dense; j < nsupc; ++j) { |
| lusup[k] = dense_col[irow]; |
| dense_col[irow] = 0.0; |
| k += len; |
| dense_col += ldaspa; |
| } |
| } |
| } /* for i ... */ |
| } else { |
| Lrowind_bc_ptr[ljb] = NULL; |
| Lnzval_bc_ptr[ljb] = NULL; |
| } /* if nrbl ... */ |
| #if ( PROFlevel>=1 ) |
| t_l += SuperLU_timer_() - t; |
| #endif |
| } /* if mycol == pc */ |
| } /* for jb ... */ |
| Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; |
| Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; |
| Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; |
| Llu->Unzval_br_ptr = Unzval_br_ptr; |
| Llu->ToRecv = ToRecv; |
| Llu->ToSendD = ToSendD; |
| Llu->ToSendR = ToSendR; |
| Llu->fmod = fmod; |
| Llu->fsendx_plist = fsendx_plist; |
| Llu->nfrecvx = nfrecvx; |
| Llu->nfsendx = nfsendx; |
| Llu->bmod = bmod; |
| Llu->bsendx_plist = bsendx_plist; |
| Llu->nbrecvx = nbrecvx; |
| Llu->nbsendx = nbsendx; |
| Llu->ilsum = ilsum; |
| Llu->ldalsum = ldaspa; |
| #if ( PRNTlevel>=1 ) |
| if ( !iam ) printf(".. # L blocks %d\t# U blocks %d\n", |
| nLblocks, nUblocks); |
| #endif |
| SUPERLU_FREE(rb_marker); |
| SUPERLU_FREE(Urb_fstnz); |
| SUPERLU_FREE(Urb_length); |
| SUPERLU_FREE(Urb_indptr); |
| SUPERLU_FREE(Lrb_length); |
| SUPERLU_FREE(Lrb_number); |
| SUPERLU_FREE(Lrb_indptr); |
| SUPERLU_FREE(Lrb_valptr); |
| SUPERLU_FREE(dense); |
| k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ |
| if ( !(Llu->mod_bit = intMalloc_dist(k)) ) |
| ABORT("Malloc fails for mod_bit[]."); |
| /* Find the maximum buffer size. */ |
| MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, |
| MPI_MAX, grid->comm); |
| #if ( PROFlevel>=1 ) |
| if ( !iam ) printf(".. 1st distribute time:\n " |
| "\tL\t%.2f\n\tU\t%.2f\n" |
| "\tu_blks %d\tnrbu %d\n--------\n", |
| t_l, t_u, u_blks, nrbu); |
| #endif |
| } /* else fact != SamePattern_SameRowPerm */ |
| #if ( DEBUGlevel>=1 ) |
| /* Memory allocated but not freed: |
| ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ |
| CHECK_MALLOC(iam, "Exit ddistribute()"); |
| #endif |
| return (mem_use); |
| } /* DDISTRIBUTE */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/pdgstrs_Bglobal_Bsend.c |
|---|
| New file |
| 0,0 → 1,1007 |
| /*! @file |
| * \brief Solves a system of distributed linear equations |
| * |
| * <pre> |
| * -- Distributed SuperLU routine (version 1.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * September 1, 1999 |
| * |
| * Modified: |
| * Feburary 7, 2001 use MPI_Isend/MPI_Irecv |
| * October 2, 2001 use MPI_Isend/MPI_Irecv with MPI_Test |
| * </pre> |
| */ |
| #include "superlu_ddefs.h" |
| /*#define ISEND_IRECV*/ |
| /* Parry's change |
| Use MPI_Bsend with a large buffer attached in the main program */ |
| #define BSEND 1 |
| /* |
| * Function prototypes |
| */ |
| #ifdef _CRAY |
| fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, double*, |
| double*, int*, double*, int*); |
| fortran void SGEMM(_fcd, _fcd, int*, int*, int*, double*, double*, |
| int*, double*, int*, double*, double*, int*); |
| _fcd ftcs1; |
| _fcd ftcs2; |
| _fcd ftcs3; |
| #endif |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * |
| * pdgstrs_Bglobal solves a system of distributed linear equations |
| * A*X = B with a general N-by-N matrix A using the LU factorization |
| * computed by pdgstrf. |
| * |
| * Arguments |
| * ========= |
| * |
| * n (input) int (global) |
| * The order of the system of linear equations. |
| * |
| * LUstruct (input) LUstruct_t* |
| * The distributed data structures storing L and U factors. |
| * The L and U factors are obtained from pdgstrf for |
| * the possibly scaled and permuted matrix A. |
| * See superlu_ddefs.h for the definition of 'LUstruct_t'. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. It contains the MPI communicator, the number |
| * of process rows (NPROW), the number of process columns (NPCOL), |
| * and my process rank. It is an input argument to all the |
| * parallel routines. |
| * Grid can be initialized by subroutine SUPERLU_GRIDINIT. |
| * See superlu_ddefs.h for the definition of 'gridinfo_t'. |
| * |
| * B (input/output) double* |
| * On entry, the right-hand side matrix of the possibly equilibrated |
| * and row permuted system. |
| * On exit, the solution matrix of the possibly equilibrated |
| * and row permuted system if info = 0; |
| * |
| * NOTE: Currently, the N-by-NRHS matrix B must reside on all |
| * processes when calling this routine. |
| * |
| * ldb (input) int (global) |
| * Leading dimension of matrix B. |
| * |
| * nrhs (input) int (global) |
| * Number of right-hand sides. |
| * |
| * stat (output) SuperLUStat_t* |
| * Record the statistics about the triangular solves. |
| * See util.h for the definition of 'SuperLUStat_t'. |
| * |
| * info (output) int* |
| * = 0: successful exit |
| * < 0: if info = -i, the i-th argument had an illegal value |
| * </pre> |
| */ |
| void |
| pdgstrs_Bglobal(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *B, |
| int_t ldb, int nrhs, SuperLUStat_t *stat, int *info) |
| { |
| Glu_persist_t *Glu_persist = LUstruct->Glu_persist; |
| LocalLU_t *Llu = LUstruct->Llu; |
| double alpha = 1.0; |
| double *lsum; /* Local running sum of the updates to B-components */ |
| double *x; /* X component at step k. */ |
| double *lusup, *dest; |
| double *recvbuf, *tempv; |
| double *rtemp; /* Result of full matrix-vector multiply. */ |
| int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; |
| int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ |
| Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ |
| int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ |
| int_t iam, kcol, krow, mycol, myrow; |
| int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; |
| int_t nb, nlb, nub, nsupers; |
| int_t *xsup, *lsub, *usub; |
| int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ |
| int_t Pc, Pr; |
| int knsupc, nsupr; |
| int ldalsum; /* Number of lsum entries locally owned. */ |
| int maxrecvsz, p, pi; |
| int_t **Lrowind_bc_ptr; |
| double **Lnzval_bc_ptr; |
| MPI_Status status; |
| #if defined(ISEND_IRECV) || defined(BSEND) |
| MPI_Request *send_req, recv_req; |
| int test_flag; |
| #endif |
| /*-- Counts used for L-solve --*/ |
| int_t *fmod; /* Modification count for L-solve. */ |
| int_t **fsendx_plist = Llu->fsendx_plist; |
| int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ |
| int_t *frecv; /* Count of modifications to be recv'd from |
| processes in this row. */ |
| int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ |
| int_t nleaf = 0, nroot = 0; |
| /*-- Counts used for U-solve --*/ |
| int_t *bmod; /* Modification count for L-solve. */ |
| int_t **bsendx_plist = Llu->bsendx_plist; |
| int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ |
| int_t *brecv; /* Count of modifications to be recv'd from |
| processes in this row. */ |
| int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ |
| double t; |
| #if ( DEBUGlevel>=2 ) |
| int_t Ublocks = 0; |
| #endif |
| /*-- Function prototypes --*/ |
| extern void gather_diag_to_all(int_t, int_t, double [], Glu_persist_t *, |
| LocalLU_t *, gridinfo_t *, int_t, int_t [], |
| int_t [], double [], int_t, double []); |
| t = SuperLU_timer_(); |
| /* Test input parameters. */ |
| *info = 0; |
| if ( n < 0 ) *info = -1; |
| else if ( nrhs < 0 ) *info = -9; |
| if ( *info ) { |
| pxerbla("PDGSTRS_BGLOBAL", grid, -*info); |
| return; |
| } |
| /* |
| * Initialization. |
| */ |
| iam = grid->iam; |
| #ifdef BSEND |
| if(!iam) { |
| printf("Using MPI_Bsend in triangular solve\n"); |
| fflush(stdout); |
| } |
| #endif |
| Pc = grid->npcol; |
| Pr = grid->nprow; |
| myrow = MYROW( iam, grid ); |
| mycol = MYCOL( iam, grid ); |
| nsupers = Glu_persist->supno[n-1] + 1; |
| xsup = Glu_persist->xsup; |
| Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; |
| Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; |
| nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ |
| stat->ops[SOLVE] = 0.0; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Enter pdgstrs_Bglobal()"); |
| #endif |
| /* Save the count to be altered so it can be used by |
| subsequent call to PDGSTRS_BGLOBAL. */ |
| if ( !(fmod = intMalloc_dist(nlb)) ) |
| ABORT("Calloc fails for fmod[]."); |
| for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; |
| if ( !(frecv = intMalloc_dist(nlb)) ) |
| ABORT("Malloc fails for frecv[]."); |
| Llu->frecv = frecv; |
| #if defined(ISEND_IRECV) || defined(BSEND) |
| if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(Pr*sizeof(MPI_Request))) ) |
| ABORT("Malloc fails for send_req[]."); |
| for (i = 0; i < Pr; ++i) send_req[i] = MPI_REQUEST_NULL; |
| #endif |
| #ifdef _CRAY |
| ftcs1 = _cptofcd("L", strlen("L")); |
| ftcs2 = _cptofcd("N", strlen("N")); |
| ftcs3 = _cptofcd("U", strlen("U")); |
| #endif |
| /* Obtain ilsum[] and ldalsum for process column 0. */ |
| ilsum = Llu->ilsum; |
| ldalsum = Llu->ldalsum; |
| /* Allocate working storage. */ |
| knsupc = sp_ienv_dist(3); |
| maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); |
| if ( !(lsum = doubleCalloc_dist(((size_t)ldalsum) * nrhs + nlb * LSUM_H))) |
| ABORT("Calloc fails for lsum[]."); |
| if ( !(x = doubleMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) |
| ABORT("Malloc fails for x[]."); |
| if ( !(recvbuf = doubleMalloc_dist(maxrecvsz)) ) |
| ABORT("Malloc fails for recvbuf[]."); |
| if ( !(rtemp = doubleMalloc_dist(maxrecvsz)) ) |
| ABORT("Malloc fails for rtemp[]."); |
| /*--------------------------------------------------- |
| * Forward solve Ly = b. |
| *---------------------------------------------------*/ |
| /* |
| * Copy B into X on the diagonal processes. |
| */ |
| ii = 0; |
| for (k = 0; k < nsupers; ++k) { |
| knsupc = SuperSize( k ); |
| krow = PROW( k, grid ); |
| if ( myrow == krow ) { |
| lk = LBi( k, grid ); /* Local block number. */ |
| il = LSUM_BLK( lk ); |
| lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ |
| kcol = PCOL( k, grid ); |
| if ( mycol == kcol ) { /* Diagonal process. */ |
| jj = X_BLK( lk ); |
| x[jj - XK_H] = k; /* Block number prepended in the header. */ |
| RHS_ITERATE(j) |
| for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ |
| x[i + jj + j*knsupc] = B[i + ii + j*ldb]; |
| } |
| } |
| ii += knsupc; |
| } |
| /* |
| * Compute frecv[] and nfrecvmod counts on the diagonal processes. |
| */ |
| { |
| superlu_scope_t *scp = &grid->rscp; |
| for (k = 0; k < nsupers; ++k) { |
| krow = PROW( k, grid ); |
| if ( myrow == krow ) { |
| lk = LBi( k, grid ); /* Local block number. */ |
| kcol = PCOL( k, grid ); /* Root process in this row scope. */ |
| if ( mycol != kcol && fmod[lk] ) |
| i = 1; /* Contribution from non-diagonal process. */ |
| else i = 0; |
| MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, |
| MPI_SUM, kcol, scp->comm ); |
| if ( mycol == kcol ) { /* Diagonal process. */ |
| nfrecvmod += frecv[lk]; |
| if ( !frecv[lk] && !fmod[lk] ) ++nleaf; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); |
| assert( frecv[lk] < Pc ); |
| #endif |
| } |
| } |
| } |
| } |
| /* --------------------------------------------------------- |
| Solve the leaf nodes first by all the diagonal processes. |
| --------------------------------------------------------- */ |
| #if ( DEBUGlevel>=1 ) |
| printf("(%2d) nleaf %4d\n", iam, nleaf); |
| #endif |
| for (k = 0; k < nsupers && nleaf; ++k) { |
| krow = PROW( k, grid ); |
| kcol = PCOL( k, grid ); |
| if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ |
| knsupc = SuperSize( k ); |
| lk = LBi( k, grid ); |
| if ( frecv[lk]==0 && fmod[lk]==0 ) { |
| fmod[lk] = -1; /* Do not solve X[k] in the future. */ |
| ii = X_BLK( lk ); |
| lk = LBj( k, grid ); /* Local block number, column-wise. */ |
| lsub = Lrowind_bc_ptr[lk]; |
| lusup = Lnzval_bc_ptr[lk]; |
| nsupr = lsub[1]; |
| #ifdef _CRAY |
| STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, |
| lusup, &nsupr, &x[ii], &knsupc); |
| #else |
| dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, |
| lusup, &nsupr, &x[ii], &knsupc); |
| #endif |
| stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; |
| --nleaf; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Solve X[%2d]\n", iam, k); |
| #endif |
| /* |
| * Send Xk to process column Pc[k]. |
| */ |
| for (p = 0; p < Pr; ++p) |
| if ( fsendx_plist[lk][p] != EMPTY ) { |
| pi = PNUM( p, kcol, grid ); |
| #ifdef ISEND_IRECV |
| #if 1 |
| MPI_Test( &send_req[p], &test_flag, &status ); |
| #else |
| if ( send_req[p] != MPI_REQUEST_NULL ) |
| MPI_Wait( &send_req[p], &status ); |
| #endif |
| MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]); |
| #else |
| #ifdef BSEND |
| MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm ); |
| #else |
| MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm ); |
| #endif |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Sent X[%2.0f] to P %2d\n", |
| iam, x[ii-XK_H], pi); |
| #endif |
| } |
| /* |
| * Perform local block modifications: lsum[i] -= L_i,k * X[k] |
| */ |
| nb = lsub[0] - 1; |
| lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; |
| luptr = knsupc; /* Skip diagonal block L(k,k). */ |
| dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, |
| fmod, nb, lptr, luptr, xsup, grid, Llu, |
| send_req,stat); |
| #ifdef ISEND_IRECV |
| /* Wait for previous Isends to complete. */ |
| for (p = 0; p < Pr; ++p) { |
| if ( fsendx_plist[lk][p] != EMPTY ) |
| /*MPI_Wait( &send_req[p], &status );*/ |
| MPI_Test( &send_req[p], &test_flag, &status ); |
| } |
| #endif |
| } |
| } /* if diagonal process ... */ |
| } /* for k ... */ |
| /* ----------------------------------------------------------- |
| Compute the internal nodes asynchronously by all processes. |
| ----------------------------------------------------------- */ |
| #if ( DEBUGlevel>=1 ) |
| printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", |
| iam, nfrecvx, nfrecvmod, nleaf); |
| #endif |
| while ( nfrecvx || nfrecvmod ) { /* While not finished. */ |
| /* Receive a message. */ |
| #ifdef ISEND_IRECV |
| /* -MPI- FATAL: Remote protocol queue full */ |
| MPI_Irecv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, |
| MPI_ANY_TAG, grid->comm, &recv_req ); |
| MPI_Wait( &recv_req, &status ); |
| #else |
| MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, |
| MPI_ANY_TAG, grid->comm, &status ); |
| #endif |
| k = *recvbuf; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); |
| #endif |
| switch ( status.MPI_TAG ) { |
| case Xk: |
| --nfrecvx; |
| lk = LBj( k, grid ); /* Local block number, column-wise. */ |
| lsub = Lrowind_bc_ptr[lk]; |
| lusup = Lnzval_bc_ptr[lk]; |
| if ( lsub ) { |
| nb = lsub[0]; |
| lptr = BC_HEADER; |
| luptr = 0; |
| knsupc = SuperSize( k ); |
| /* |
| * Perform local block modifications: lsum[i] -= L_i,k * X[k] |
| */ |
| dlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, |
| fmod, nb, lptr, luptr, xsup, grid, Llu, |
| send_req, stat); |
| } /* if lsub */ |
| break; |
| case LSUM: |
| --nfrecvmod; |
| lk = LBi( k, grid ); /* Local block number, row-wise. */ |
| ii = X_BLK( lk ); |
| knsupc = SuperSize( k ); |
| tempv = &recvbuf[LSUM_H]; |
| RHS_ITERATE(j) |
| for (i = 0; i < knsupc; ++i) |
| x[i + ii + j*knsupc] += tempv[i + j*knsupc]; |
| if ( (--frecv[lk])==0 && fmod[lk]==0 ) { |
| fmod[lk] = -1; /* Do not solve X[k] in the future. */ |
| lk = LBj( k, grid ); /* Local block number, column-wise. */ |
| lsub = Lrowind_bc_ptr[lk]; |
| lusup = Lnzval_bc_ptr[lk]; |
| nsupr = lsub[1]; |
| #ifdef _CRAY |
| STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, |
| lusup, &nsupr, &x[ii], &knsupc); |
| #else |
| dtrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, |
| lusup, &nsupr, &x[ii], &knsupc); |
| #endif |
| stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Solve X[%2d]\n", iam, k); |
| #endif |
| /* |
| * Send Xk to process column Pc[k]. |
| */ |
| kcol = PCOL( k, grid ); |
| for (p = 0; p < Pr; ++p) |
| if ( fsendx_plist[lk][p] != EMPTY ) { |
| pi = PNUM( p, kcol, grid ); |
| #ifdef ISEND_IRECV |
| #if 1 |
| MPI_Test( &send_req[p], &test_flag, &status ); |
| #else |
| if ( send_req[p] != MPI_REQUEST_NULL ) |
| MPI_Wait( &send_req[p], &status ); |
| #endif |
| MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm, |
| &send_req[p]); |
| #else |
| #ifdef BSEND |
| MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm ); |
| #else |
| MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm ); |
| #endif |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Sent X[%2.0f] to P %2d\n", |
| iam, x[ii-XK_H], pi); |
| #endif |
| } |
| /* |
| * Perform local block modifications. |
| */ |
| nb = lsub[0] - 1; |
| lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; |
| luptr = knsupc; /* Skip diagonal block L(k,k). */ |
| dlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, |
| fmod, nb, lptr, luptr, xsup, grid, Llu, |
| send_req, stat); |
| #ifdef ISEND_IRECV |
| /* Wait for the previous Isends to complete. */ |
| for (p = 0; p < Pr; ++p) { |
| if ( fsendx_plist[lk][p] != EMPTY ) |
| MPI_Test( &send_req[p], &test_flag, &status ); |
| } |
| #endif |
| } /* if */ |
| break; |
| #if ( DEBUGlevel>=1 ) |
| default: |
| printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); |
| break; |
| #endif |
| } /* switch */ |
| } /* while not finished ... */ |
| #if ( PRNTlevel>=2 ) |
| t = SuperLU_timer_() - t; |
| if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); |
| t = SuperLU_timer_(); |
| #endif |
| #if ( PRNTlevel==2 ) |
| if ( !iam ) printf("\n.. After L-solve: y =\n"); |
| for (i = 0, k = 0; k < nsupers; ++k) { |
| krow = PROW( k, grid ); |
| kcol = PCOL( k, grid ); |
| if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ |
| knsupc = SuperSize( k ); |
| lk = LBi( k, grid ); |
| ii = X_BLK( lk ); |
| for (j = 0; j < knsupc; ++j) |
| printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); |
| } |
| MPI_Barrier( grid->comm ); |
| } |
| #endif |
| SUPERLU_FREE(fmod); |
| SUPERLU_FREE(frecv); |
| SUPERLU_FREE(rtemp); |
| /* MPI_Barrier( grid->comm ); Drain messages in the forward solve. */ |
| /*--------------------------------------------------- |
| * Back solve Ux = y. |
| * |
| * The Y components from the forward solve is already |
| * on the diagonal processes. |
| *---------------------------------------------------*/ |
| /* Save the count to be altered so it can be used by |
| subsequent call to PDGSTRS_BGLOBAL. */ |
| if ( !(bmod = intMalloc_dist(nlb)) ) |
| ABORT("Calloc fails for bmod[]."); |
| for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; |
| if ( !(brecv = intMalloc_dist(nlb)) ) |
| ABORT("Malloc fails for brecv[]."); |
| Llu->brecv = brecv; |
| /* |
| * Compute brecv[] and nbrecvmod counts on the diagonal processes. |
| */ |
| { |
| superlu_scope_t *scp = &grid->rscp; |
| for (k = 0; k < nsupers; ++k) { |
| krow = PROW( k, grid ); |
| if ( myrow == krow ) { |
| lk = LBi( k, grid ); /* Local block number. */ |
| kcol = PCOL( k, grid ); /* Root process in this row scope. */ |
| if ( mycol != kcol && bmod[lk] ) |
| i = 1; /* Contribution from non-diagonal process. */ |
| else i = 0; |
| MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, |
| MPI_SUM, kcol, scp->comm ); |
| if ( mycol == kcol ) { /* Diagonal process. */ |
| nbrecvmod += brecv[lk]; |
| if ( !brecv[lk] && !bmod[lk] ) ++nroot; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); |
| assert( brecv[lk] < Pc ); |
| #endif |
| } |
| } |
| } |
| } |
| /* Re-initialize lsum to zero. Each block header is already in place. */ |
| for (k = 0; k < nsupers; ++k) { |
| krow = PROW( k, grid ); |
| if ( myrow == krow ) { |
| knsupc = SuperSize( k ); |
| lk = LBi( k, grid ); |
| il = LSUM_BLK( lk ); |
| dest = &lsum[il]; |
| RHS_ITERATE(j) |
| for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; |
| } |
| } |
| /* Set up additional pointers for the index and value arrays of U. |
| nlb is the number of local block rows. */ |
| nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ |
| if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) |
| ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero |
| blocks in a block column. */ |
| Urbs1 = Urbs + nub; |
| if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) |
| ABORT("Malloc fails for Ucb_indptr[]"); |
| if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) |
| ABORT("Malloc fails for Ucb_valptr[]"); |
| /* Count number of row blocks in a block column. |
| One pass of the skeleton graph of U. */ |
| for (lk = 0; lk < nlb; ++lk) { |
| usub = Ufstnz_br_ptr[lk]; |
| if ( usub ) { /* Not an empty block row. */ |
| /* usub[0] -- number of column blocks in this block row. */ |
| #if ( DEBUGlevel>=2 ) |
| Ublocks += usub[0]; |
| #endif |
| i = BR_HEADER; /* Pointer in index array. */ |
| for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ |
| k = usub[i]; /* Global block number */ |
| ++Urbs[LBj(k,grid)]; |
| i += UB_DESCRIPTOR + SuperSize( k ); |
| } |
| } |
| } |
| /* Set up the vertical linked lists for the row blocks. |
| One pass of the skeleton graph of U. */ |
| for (lb = 0; lb < nub; ++lb) |
| if ( Urbs[lb] ) { /* Not an empty block column. */ |
| if ( !(Ucb_indptr[lb] |
| = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) |
| ABORT("Malloc fails for Ucb_indptr[lb][]"); |
| if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) |
| ABORT("Malloc fails for Ucb_valptr[lb][]"); |
| } |
| for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ |
| usub = Ufstnz_br_ptr[lk]; |
| if ( usub ) { /* Not an empty block row. */ |
| i = BR_HEADER; /* Pointer in index array. */ |
| j = 0; /* Pointer in nzval array. */ |
| for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ |
| k = usub[i]; /* Global block number, column-wise. */ |
| ljb = LBj( k, grid ); /* Local block number, column-wise. */ |
| Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; |
| Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; |
| Ucb_valptr[ljb][Urbs1[ljb]] = j; |
| ++Urbs1[ljb]; |
| j += usub[i+1]; |
| i += UB_DESCRIPTOR + SuperSize( k ); |
| } |
| } |
| } |
| #if ( DEBUGlevel>=2 ) |
| for (p = 0; p < Pr*Pc; ++p) { |
| if (iam == p) { |
| printf("(%2d) .. Ublocks %d\n", iam, Ublocks); |
| for (lb = 0; lb < nub; ++lb) { |
| printf("(%2d) Local col %2d: # row blocks %2d\n", |
| iam, lb, Urbs[lb]); |
| if ( Urbs[lb] ) { |
| for (i = 0; i < Urbs[lb]; ++i) |
| printf("(%2d) .. row blk %2d:\ |
| lbnum %d, indpos %d, valpos %d\n", |
| iam, i, |
| Ucb_indptr[lb][i].lbnum, |
| Ucb_indptr[lb][i].indpos, |
| Ucb_valptr[lb][i]); |
| } |
| } |
| } |
| MPI_Barrier( grid->comm ); |
| } |
| for (p = 0; p < Pr*Pc; ++p) { |
| if ( iam == p ) { |
| printf("\n(%d) bsendx_plist[][]", iam); |
| for (lb = 0; lb < nub; ++lb) { |
| printf("\n(%d) .. local col %2d: ", iam, lb); |
| for (i = 0; i < Pr; ++i) |
| printf("%4d", bsendx_plist[lb][i]); |
| } |
| printf("\n"); |
| } |
| MPI_Barrier( grid->comm ); |
| } |
| #endif /* DEBUGlevel */ |
| #if ( PRNTlevel>=3 ) |
| t = SuperLU_timer_() - t; |
| if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); |
| t = SuperLU_timer_(); |
| #endif |
| /* |
| * Solve the roots first by all the diagonal processes. |
| */ |
| #if ( DEBUGlevel>=1 ) |
| printf("(%2d) nroot %4d\n", iam, nroot); |
| #endif |
| for (k = nsupers-1; k >= 0 && nroot; --k) { |
| krow = PROW( k, grid ); |
| kcol = PCOL( k, grid ); |
| if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ |
| knsupc = SuperSize( k ); |
| lk = LBi( k, grid ); /* Local block number, row-wise. */ |
| if ( brecv[lk]==0 && bmod[lk]==0 ) { |
| bmod[lk] = -1; /* Do not solve X[k] in the future. */ |
| ii = X_BLK( lk ); |
| lk = LBj( k, grid ); /* Local block number, column-wise */ |
| lsub = Lrowind_bc_ptr[lk]; |
| lusup = Lnzval_bc_ptr[lk]; |
| nsupr = lsub[1]; |
| #ifdef _CRAY |
| STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, |
| lusup, &nsupr, &x[ii], &knsupc); |
| #else |
| dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, |
| lusup, &nsupr, &x[ii], &knsupc); |
| #endif |
| stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; |
| --nroot; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Solve X[%2d]\n", iam, k); |
| #endif |
| /* |
| * Send Xk to process column Pc[k]. |
| */ |
| for (p = 0; p < Pr; ++p) |
| if ( bsendx_plist[lk][p] != EMPTY ) { |
| pi = PNUM( p, kcol, grid ); |
| #ifdef ISEND_IRECV |
| #if 1 |
| MPI_Test( &send_req[p], &test_flag, &status ); |
| #else |
| if ( send_req[p] != MPI_REQUEST_NULL ) |
| MPI_Wait( &send_req[p], &status ); |
| #endif |
| MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm, &send_req[p]); |
| #else |
| #ifdef BSEND |
| MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm ); |
| #else |
| MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm ); |
| #endif |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Sent X[%2.0f] to P %2d\n", |
| iam, x[ii-XK_H], pi); |
| #endif |
| } |
| /* |
| * Perform local block modifications: lsum[i] -= U_i,k * X[k] |
| */ |
| if ( Urbs[lk] ) |
| dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, |
| Ucb_indptr, Ucb_valptr, xsup, grid, Llu, |
| send_req, stat); |
| #ifdef ISEND_IRECV |
| /* Wait for the previous Isends to complete. */ |
| for (p = 0; p < Pr; ++p) { |
| if ( bsendx_plist[lk][p] != EMPTY ) |
| /*MPI_Wait( &send_req[p], &status );*/ |
| MPI_Test( &send_req[p], &test_flag, &status ); |
| } |
| #endif |
| } /* if root ... */ |
| } /* if diagonal process ... */ |
| } /* for k ... */ |
| /* |
| * Compute the internal nodes asychronously by all processes. |
| */ |
| while ( nbrecvx || nbrecvmod ) { /* While not finished. */ |
| /* Receive a message. */ |
| MPI_Recv( recvbuf, maxrecvsz, MPI_DOUBLE, MPI_ANY_SOURCE, |
| MPI_ANY_TAG, grid->comm, &status ); |
| k = *recvbuf; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); |
| #endif |
| switch ( status.MPI_TAG ) { |
| case Xk: |
| --nbrecvx; |
| lk = LBj( k, grid ); /* Local block number, column-wise. */ |
| /* |
| * Perform local block modifications: |
| * lsum[i] -= U_i,k * X[k] |
| */ |
| dlsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, |
| Ucb_indptr, Ucb_valptr, xsup, grid, Llu, |
| send_req, stat); |
| break; |
| case LSUM: |
| --nbrecvmod; |
| lk = LBi( k, grid ); /* Local block number, row-wise. */ |
| ii = X_BLK( lk ); |
| knsupc = SuperSize( k ); |
| tempv = &recvbuf[LSUM_H]; |
| RHS_ITERATE(j) |
| for (i = 0; i < knsupc; ++i) |
| x[i + ii + j*knsupc] += tempv[i + j*knsupc]; |
| if ( (--brecv[lk])==0 && bmod[lk]==0 ) { |
| bmod[lk] = -1; /* Do not solve X[k] in the future. */ |
| lk = LBj( k, grid ); /* Local block number, column-wise. */ |
| lsub = Lrowind_bc_ptr[lk]; |
| lusup = Lnzval_bc_ptr[lk]; |
| nsupr = lsub[1]; |
| #ifdef _CRAY |
| STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, |
| lusup, &nsupr, &x[ii], &knsupc); |
| #else |
| dtrsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, |
| lusup, &nsupr, &x[ii], &knsupc); |
| #endif |
| stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Solve X[%2d]\n", iam, k); |
| #endif |
| /* |
| * Send Xk to process column Pc[k]. |
| */ |
| kcol = PCOL( k, grid ); |
| for (p = 0; p < Pr; ++p) |
| if ( bsendx_plist[lk][p] != EMPTY ) { |
| pi = PNUM( p, kcol, grid ); |
| #ifdef ISEND_IRECV |
| #if 1 |
| MPI_Test( &send_req[p], &test_flag, &status ); |
| #else |
| if ( send_req[p] != MPI_REQUEST_NULL ) |
| MPI_Wait( &send_req[p], &status ); |
| #endif |
| MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm, |
| &send_req[p] ); |
| #else |
| #ifdef BSEND |
| MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm ); |
| #else |
| MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, |
| MPI_DOUBLE, pi, Xk, grid->comm ); |
| #endif |
| #endif |
| #if ( DEBUGlevel>=2 ) |
| printf("(%2d) Sent X[%2.0f] to P %2d\n", |
| iam, x[ii - XK_H], pi); |
| #endif |
| } |
| /* |
| * Perform local block modifications: |
| * lsum[i] -= U_i,k * X[k] |
| */ |
| if ( Urbs[lk] ) |
| dlsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, |
| Ucb_indptr, Ucb_valptr, xsup, grid, Llu, |
| send_req, stat); |
| #ifdef ISEND_IRECV |
| /* Wait for the previous Isends to complete. */ |
| for (p = 0; p < Pr; ++p) { |
| if ( bsendx_plist[lk][p] != EMPTY ) |
| /*MPI_Wait( &send_req[p], &status );*/ |
| MPI_Test( &send_req[p], &test_flag, &status ); |
| } |
| #endif |
| } /* if becomes solvable */ |
| break; |
| #if ( DEBUGlevel>=1 ) |
| default: |
| printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); |
| break; |
| #endif |
| } /* switch */ |
| } /* while not finished ... */ |
| #if ( PRNTlevel>=3 ) |
| t = SuperLU_timer_() - t; |
| if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); |
| #endif |
| /* Copy the solution X into B (on all processes). */ |
| { |
| int_t num_diag_procs, *diag_procs, *diag_len; |
| double *work; |
| get_diag_procs(n, Glu_persist, grid, &num_diag_procs, |
| &diag_procs, &diag_len); |
| jj = diag_len[0]; |
| for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]); |
| if ( !(work = doubleMalloc_dist(jj*nrhs)) ) |
| ABORT("Malloc fails for work[]"); |
| gather_diag_to_all(n, nrhs, x, Glu_persist, Llu, |
| grid, num_diag_procs, diag_procs, diag_len, |
| B, ldb, work); |
| SUPERLU_FREE(diag_procs); |
| SUPERLU_FREE(diag_len); |
| SUPERLU_FREE(work); |
| } |
| /* Deallocate storage. */ |
| SUPERLU_FREE(lsum); |
| SUPERLU_FREE(x); |
| SUPERLU_FREE(recvbuf); |
| for (i = 0; i < nub; ++i) |
| if ( Urbs[i] ) { |
| SUPERLU_FREE(Ucb_indptr[i]); |
| SUPERLU_FREE(Ucb_valptr[i]); |
| } |
| SUPERLU_FREE(Ucb_indptr); |
| SUPERLU_FREE(Ucb_valptr); |
| SUPERLU_FREE(Urbs); |
| SUPERLU_FREE(bmod); |
| SUPERLU_FREE(brecv); |
| #ifdef ISEND_IRECV |
| for (p = 0; p < Pr; ++p) { |
| if ( send_req[p] != MPI_REQUEST_NULL ) |
| MPI_Wait( &send_req[p], &status ); |
| } |
| SUPERLU_FREE(send_req); |
| #endif |
| stat->utime[SOLVE] = SuperLU_timer_() - t; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(iam, "Exit pdgstrs_Bglobal()"); |
| #endif |
| /* Chao debug */ |
| MPI_Barrier( grid->comm ); /* Drain messages in the forward solve. */ |
| } /* PDGSTRS_BGLOBAL */ |
| /*! \brief |
| * |
| * <pre> |
| * Gather the components of x vector on the diagonal processes |
| * onto all processes, and combine them into the global vector y. |
| * </pre> |
| */ |
| static void |
| gather_diag_to_all(int_t n, int_t nrhs, double x[], |
| Glu_persist_t *Glu_persist, LocalLU_t *Llu, |
| gridinfo_t *grid, int_t num_diag_procs, |
| int_t diag_procs[], int_t diag_len[], |
| double y[], int_t ldy, double work[]) |
| { |
| int_t i, ii, j, k, lk, lwork, nsupers, p; |
| int_t *ilsum, *xsup; |
| int iam, knsupc, pkk; |
| double *x_col, *y_col; |
| iam = grid->iam; |
| nsupers = Glu_persist->supno[n-1] + 1; |
| xsup = Glu_persist->xsup; |
| ilsum = Llu->ilsum; |
| for (p = 0; p < num_diag_procs; ++p) { |
| pkk = diag_procs[p]; |
| if ( iam == pkk ) { |
| /* Copy x vector into a buffer. */ |
| lwork = 0; |
| for (k = p; k < nsupers; k += num_diag_procs) { |
| knsupc = SuperSize( k ); |
| lk = LBi( k, grid ); |
| ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/ |
| x_col = &x[ii]; |
| for (j = 0; j < nrhs; ++j) { |
| for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i]; |
| lwork += knsupc; |
| x_col += knsupc; |
| } |
| } |
| MPI_Bcast( work, lwork, MPI_DOUBLE, pkk, grid->comm ); |
| } else { |
| MPI_Bcast( work, diag_len[p]*nrhs, MPI_DOUBLE, pkk, grid->comm ); |
| } |
| /* Scatter work[] into global y vector. */ |
| lwork = 0; |
| for (k = p; k < nsupers; k += num_diag_procs) { |
| knsupc = SuperSize( k ); |
| ii = FstBlockC( k ); |
| y_col = &y[ii]; |
| for (j = 0; j < nrhs; ++j) { |
| for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork]; |
| lwork += knsupc; |
| y_col += ldy; |
| } |
| } |
| } |
| } /* GATHER_DIAG_TO_ALL */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/temp |
|---|
| New file |
| 0,0 → 1,296 |
| 1,4c1,2 |
| < /*! @file |
| < * \brief Implements parallel symbolic factorization |
| < * |
| < * <pre> |
| --- |
| > |
| > /* |
| 12c10,13 |
| < * |
| --- |
| > */ |
| > |
| > |
| > /* |
| 19c20 |
| < * </pre> |
| --- |
| > * |
| 22,23d22 |
| < |
| < |
| 130,132c129,147 |
| < /*! \brief |
| < * |
| < * <pre> |
| --- |
| > float symbfact_dist |
| > /************************************************************************/ |
| > ( |
| > int nprocs_num, /* Input - no of processors */ |
| > int nprocs_symb, /* Input - no of processors for the symbolic |
| > factorization */ |
| > SuperMatrix *A, /* Input - distributed input matrix */ |
| > int_t *perm_c, /* Input - column permutation */ |
| > int_t *perm_r, /* Input - row permutation */ |
| > int_t *sizes, /* Input - sizes of each node in the separator tree */ |
| > int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ |
| > Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure, |
| > global to local indexing information */ |
| > MPI_Comm *num_comm, /* Input - communicator for numerical factorization */ |
| > MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */ |
| > mem_usage_t *symb_mem_usage |
| > ) |
| > { |
| > /* |
| 214c229 |
| < * </pre> |
| --- |
| > * |
| 216,234d230 |
| < float symbfact_dist |
| < /************************************************************************/ |
| < ( |
| < int nprocs_num, /* Input - no of processors */ |
| < int nprocs_symb, /* Input - no of processors for the symbolic |
| < factorization */ |
| < SuperMatrix *A, /* Input - distributed input matrix */ |
| < int_t *perm_c, /* Input - column permutation */ |
| < int_t *perm_r, /* Input - row permutation */ |
| < int_t *sizes, /* Input - sizes of each node in the separator tree */ |
| < int_t *fstVtxSep, /* Input - first vertex of each node in the tree */ |
| < Pslu_freeable_t *Pslu_freeable, /* Output - local L and U structure, |
| < global to local indexing information */ |
| < MPI_Comm *num_comm, /* Input - communicator for numerical factorization */ |
| < MPI_Comm *symb_comm, /* Input - communicator for symbolic factorization */ |
| < mem_usage_t *symb_mem_usage |
| < ) |
| < { |
| < |
| 756,762c752 |
| < /*! \brief |
| < * <pre> |
| < * Purpose |
| < * ======= |
| < * Initialize relaxation parameters and statistics variables |
| < * </pre> |
| < */ |
| --- |
| > |
| 768c758,762 |
| < |
| --- |
| > /* |
| > * Purpose |
| > * ======= |
| > * Initialize relaxation parameters and statistics variables |
| > */ |
| 804,814d797 |
| < /*! \brief |
| < * |
| < * <pre> |
| < * Purpose |
| < * ======= |
| < * |
| < * Computes an estimation of the number of elements in columns of L |
| < * and rows of U. Stores this information in cntelt_vtcs, and it will |
| < * be used in the right-looking symbolic factorization. |
| < * </pre> |
| < */ |
| 830c813,820 |
| < |
| --- |
| > /* |
| > * Purpose |
| > * ======= |
| > * |
| > * Computes an estimation of the number of elements in columns of L |
| > * and rows of U. Stores this information in cntelt_vtcs, and it will |
| > * be used in the right-looking symbolic factorization. |
| > */ |
| 960,962c950,967 |
| < /*! \brief |
| < * |
| < * <pre> |
| --- |
| > static float |
| > symbfact_mapVtcs |
| > ( |
| > int iam, /* Input -process number */ |
| > int nprocs_num, /* Input -number of processors */ |
| > int nprocs_symb, /* Input -number of procs for symbolic factorization */ |
| > SuperMatrix *A, /* Input -input distributed matrix A */ |
| > int_t *fstVtxSep, /* Input -first vertex in each separator */ |
| > int_t *sizes, /* Input -size of each separator in the separator tree */ |
| > Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc |
| > computed */ |
| > vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */ |
| > int_t *tempArray, /* Input -temp array of size n = order of the matrix */ |
| > int_t maxSzBlk, /* Input -maximum number of vertices in a block */ |
| > psymbfact_stat_t *PS /* Input/Output -statistics */ |
| > ) |
| > { |
| > /* |
| 985c990 |
| < * </pre> |
| --- |
| > * |
| 987,1004d991 |
| < static float |
| < symbfact_mapVtcs |
| < ( |
| < int iam, /* Input -process number */ |
| < int nprocs_num, /* Input -number of processors */ |
| < int nprocs_symb, /* Input -number of procs for symbolic factorization */ |
| < SuperMatrix *A, /* Input -input distributed matrix A */ |
| < int_t *fstVtxSep, /* Input -first vertex in each separator */ |
| < int_t *sizes, /* Input -size of each separator in the separator tree */ |
| < Pslu_freeable_t *Pslu_freeable, /* Output -globToLoc and maxNvtcsPProc |
| < computed */ |
| < vtcsInfo_symbfact_t *VInfo, /* Output -local info on vertices distribution */ |
| < int_t *tempArray, /* Input -temp array of size n = order of the matrix */ |
| < int_t maxSzBlk, /* Input -maximum number of vertices in a block */ |
| < psymbfact_stat_t *PS /* Input/Output -statistics */ |
| < ) |
| < { |
| < |
| 1199,1210d1185 |
| < /*! \brief |
| < * |
| < * <pre> |
| < * Purpose |
| < * ======= |
| < * |
| < * Distribute input matrix A for the symbolic factorization routine. |
| < * Only structural information is distributed. The redistributed |
| < * matrix has its rows and columns permuted according to perm_r and |
| < * perm_c. A is not modified during this routine. |
| < * </pre> |
| < */ |
| 1231c1206,1215 |
| < |
| --- |
| > /* |
| > * Purpose |
| > * ======= |
| > * |
| > * Distribute input matrix A for the symbolic factorization routine. |
| > * Only structural information is distributed. The redistributed |
| > * matrix has its rows and columns permuted according to perm_r and |
| > * perm_c. A is not modified during this routine. |
| > * |
| > */ |
| 1466a1451,1452 |
| > intBuf1[iam]=0; /* This corresponds to nnzToSend[iam] */ |
| > intBuf3[iam]=0; /* This corresponds to nnzToRecv[iam] */ |
| 1469a1456,1458 |
| > i = nnzToRecv[iam]; |
| > nnzToRecv[iam] = 0; |
| > nnzToSend[iam] = 0; |
| 1472,1475d1460 |
| < i = nnzToRecv[iam]; /* This corresponds to nnzToRecv[iam] */ |
| < intBuf3[iam] = 0; |
| < intBuf1[iam] = 0; |
| < |
| 1478a1464 |
| > |
| 1481c1467 |
| < #endif |
| --- |
| > #else /* Default */ |
| 1482a1469 |
| > #endif |
| 1603,1613d1589 |
| < /*! \brief |
| < * |
| < * <pre> |
| < * Allocate storage for data structures necessary for pruned graphs. |
| < * For those unpredictable size, make a guess as FILL * n. |
| < * Return value: |
| < * 0 if enough memory was available; |
| < * otherwise, return the amount of space intended to allocate |
| < * when memory allocation failure occurred. |
| < * </pre> |
| < */ |
| 1623c1599,1606 |
| < |
| --- |
| > /* |
| > * Allocate storage for data structures necessary for pruned graphs. |
| > * For those unpredictable size, make a guess as FILL * n. |
| > * Return value: |
| > * 0 if enough memory was available; |
| > * otherwise, return the amount of space intended to allocate |
| > * when memory allocation failure occurred. |
| > */ |
| 1706,1716d1688 |
| < /*! \brief |
| < * |
| < * <pre> |
| < * Allocate storage for data structures necessary for pruned graphs. |
| < * For those unpredictable size, make a guess as FILL * n. |
| < * Return value: |
| < * 0 if enough memory was available; |
| < * otherwise, return the amount of space intended to allocate |
| < * when memory allocation failure occurred. |
| < * </pre> |
| < */ |
| 1728c1700,1707 |
| < |
| --- |
| > /* |
| > * Allocate storage for data structures necessary for pruned graphs. |
| > * For those unpredictable size, make a guess as FILL * n. |
| > * Return value: |
| > * 0 if enough memory was available; |
| > * otherwise, return the amount of space intended to allocate |
| > * when memory allocation failure occurred. |
| > */ |
| 1796,1806d1774 |
| < /*! \brief |
| < * |
| < * <pre> |
| < * Allocate storage for the data structures common to symbolic factorization |
| < * routines. For those unpredictable size, make a guess as FILL * nnz(A). |
| < * Return value: |
| < * 0 if enough memory was available; |
| < * otherwise, return the amount of space intended to allocate |
| < * when memory allocation failure occurred. |
| < * </pre> |
| < */ |
| 1821c1789,1796 |
| < |
| --- |
| > /* |
| > * Allocate storage for the data structures common to symbolic factorization |
| > * routines. For those unpredictable size, make a guess as FILL * nnz(A). |
| > * Return value: |
| > * 0 if enough memory was available; |
| > * otherwise, return the amount of space intended to allocate |
| > * when memory allocation failure occurred. |
| > */ |
| 1990d1964 |
| < |
| 2846,2848c2820 |
| < /*! \brief |
| < * |
| < * <pre> |
| --- |
| > /* |
| 2854d2825 |
| < * </pre> |
| 4013,4016c3984 |
| < /*! \brief |
| < |
| < <pre> |
| < all processors affected to current node must call this routine |
| --- |
| > /* all processors affected to current node must call this routine |
| 4019,4021c3987,3988 |
| < MPI_allreduce among all processors affected to curent node |
| < </pre> |
| < */ |
| --- |
| > MPI_allreduce among all processors affected to curent node */ |
| > |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/memory.patch |
|---|
| New file |
| 0,0 → 1,10 |
| 118d117 |
| < |
| 144c143 |
| < buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); |
| --- |
| > buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t)); |
| 152c151 |
| < buf = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); |
| --- |
| > buf = (int_t *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(int_t)); |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/dgsequ.c |
|---|
| New file |
| 0,0 → 1,184 |
| /*! @file |
| * \brief Computes row and column scalings |
| */ |
| /* |
| * File name: dgsequ.c |
| * History: Modified from LAPACK routine DGEEQU |
| */ |
| #include <math.h> |
| #include "superlu_ddefs.h" |
| /*! \brief |
| <pre> |
| Purpose |
| ======= |
| DGSEQU_dist computes row and column scalings intended to equilibrate an |
| M-by-N sparse matrix A and reduce its condition number. R returns the row |
| scale factors and C the column scale factors, chosen to try to make |
| the largest element in each row and column of the matrix B with |
| elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1. |
| R(i) and C(j) are restricted to be between SMLNUM = smallest safe |
| number and BIGNUM = largest safe number. Use of these scaling |
| factors is not guaranteed to reduce the condition number of A but |
| works well in practice. |
| See supermatrix.h for the definition of 'SuperMatrix' structure. |
| Arguments |
| ========= |
| A (input) SuperMatrix* |
| The matrix of dimension (A->nrow, A->ncol) whose equilibration |
| factors are to be computed. The type of A can be: |
| Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. |
| R (output) double*, size A->nrow |
| If INFO = 0 or INFO > M, R contains the row scale factors |
| for A. |
| C (output) double*, size A->ncol |
| If INFO = 0, C contains the column scale factors for A. |
| ROWCND (output) double* |
| If INFO = 0 or INFO > M, ROWCND contains the ratio of the |
| smallest R(i) to the largest R(i). If ROWCND >= 0.1 and |
| AMAX is neither too large nor too small, it is not worth |
| scaling by R. |
| COLCND (output) double* |
| If INFO = 0, COLCND contains the ratio of the smallest |
| C(i) to the largest C(i). If COLCND >= 0.1, it is not |
| worth scaling by C. |
| AMAX (output) double* |
| Absolute value of largest matrix element. If AMAX is very |
| close to overflow or very close to underflow, the matrix |
| should be scaled. |
| INFO (output) int* |
| = 0: successful exit |
| < 0: if INFO = -i, the i-th argument had an illegal value |
| > 0: if INFO = i, and i is |
| <= M: the i-th row of A is exactly zero |
| > M: the (i-M)-th column of A is exactly zero |
| ===================================================================== |
| </pre> |
| */ |
| void |
| dgsequ_dist(SuperMatrix *A, double *r, double *c, double *rowcnd, |
| double *colcnd, double *amax, int_t *info) |
| { |
| /* Local variables */ |
| NCformat *Astore; |
| double *Aval; |
| int i, j, irow; |
| double rcmin, rcmax; |
| double bignum, smlnum; |
| extern double dlamch_(char *); |
| /* Test the input parameters. */ |
| *info = 0; |
| if ( A->nrow < 0 || A->ncol < 0 || |
| A->Stype != SLU_NC || A->Dtype != SLU_D || A->Mtype != SLU_GE ) |
| *info = -1; |
| if (*info != 0) { |
| i = -(*info); |
| xerbla_("dgsequ_dist", &i); |
| return; |
| } |
| /* Quick return if possible */ |
| if ( A->nrow == 0 || A->ncol == 0 ) { |
| *rowcnd = 1.; |
| *colcnd = 1.; |
| *amax = 0.; |
| return; |
| } |
| Astore = (NCformat *) A->Store; |
| Aval = (double *) Astore->nzval; |
| /* Get machine constants. */ |
| smlnum = dlamch_("S"); |
| bignum = 1. / smlnum; |
| /* Compute row scale factors. */ |
| for (i = 0; i < A->nrow; ++i) r[i] = 0.; |
| /* Find the maximum element in each row. */ |
| for (j = 0; j < A->ncol; ++j) |
| for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { |
| irow = Astore->rowind[i]; |
| r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); |
| } |
| /* Find the maximum and minimum scale factors. */ |
| rcmin = bignum; |
| rcmax = 0.; |
| for (i = 0; i < A->nrow; ++i) { |
| rcmax = SUPERLU_MAX(rcmax, r[i]); |
| rcmin = SUPERLU_MIN(rcmin, r[i]); |
| } |
| *amax = rcmax; |
| if (rcmin == 0.) { |
| /* Find the first zero scale factor and return an error code. */ |
| for (i = 0; i < A->nrow; ++i) |
| if (r[i] == 0.) { |
| *info = i + 1; |
| return; |
| } |
| } else { |
| /* Invert the scale factors. */ |
| for (i = 0; i < A->nrow; ++i) |
| r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); |
| /* Compute ROWCND = min(R(I)) / max(R(I)) */ |
| *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); |
| } |
| /* Compute column scale factors */ |
| for (j = 0; j < A->ncol; ++j) c[j] = 0.; |
| /* Find the maximum element in each column, assuming the row |
| scalings computed above. */ |
| for (j = 0; j < A->ncol; ++j) |
| for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { |
| irow = Astore->rowind[i]; |
| c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); |
| } |
| /* Find the maximum and minimum scale factors. */ |
| rcmin = bignum; |
| rcmax = 0.; |
| for (j = 0; j < A->ncol; ++j) { |
| rcmax = SUPERLU_MAX(rcmax, c[j]); |
| rcmin = SUPERLU_MIN(rcmin, c[j]); |
| } |
| if (rcmin == 0.) { |
| /* Find the first zero scale factor and return an error code. */ |
| for (j = 0; j < A->ncol; ++j) |
| if ( c[j] == 0. ) { |
| *info = A->nrow + j + 1; |
| return; |
| } |
| } else { |
| /* Invert the scale factors. */ |
| for (j = 0; j < A->ncol; ++j) |
| c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); |
| /* Compute COLCND = min(C(J)) / max(C(J)) */ |
| *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); |
| } |
| return; |
| } /* dgsequ_dist */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/supermatrix.h |
|---|
| New file |
| 0,0 → 1,181 |
| /*! @file |
| * \brief Matrix type definitions |
| */ |
| #ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */ |
| #define __SUPERLU_SUPERMATRIX |
| /******************************************** |
| * The matrix types are defined as follows. * |
| ********************************************/ |
| typedef enum { |
| SLU_NC, /* column-wise, no supernode */ |
| SLU_NCP, /* column-wise, column-permuted, no supernode |
| (The consecutive columns of nonzeros, after permutation, |
| may not be stored contiguously.) */ |
| SLU_NR, /* row-wize, no supernode */ |
| SLU_SC, /* column-wise, supernode */ |
| SLU_SCP, /* supernode, column-wise, permuted */ |
| SLU_SR, /* row-wise, supernode */ |
| SLU_DN, /* Fortran style column-wise storage for dense matrix */ |
| SLU_NR_loc /* distributed compressed row format */ |
| } Stype_t; |
| typedef enum { |
| SLU_S, /* single */ |
| SLU_D, /* double */ |
| SLU_C, /* single complex */ |
| SLU_Z /* double complex */ |
| } Dtype_t; |
| typedef enum { |
| SLU_GE, /* general */ |
| SLU_TRLU, /* lower triangular, unit diagonal */ |
| SLU_TRUU, /* upper triangular, unit diagonal */ |
| SLU_TRL, /* lower triangular */ |
| SLU_TRU, /* upper triangular */ |
| SLU_SYL, /* symmetric, store lower half */ |
| SLU_SYU, /* symmetric, store upper half */ |
| SLU_HEL, /* Hermitian, store lower half */ |
| SLU_HEU /* Hermitian, store upper half */ |
| } Mtype_t; |
| typedef struct { |
| Stype_t Stype; /* Storage type: interprets the storage structure |
| pointed to by *Store. */ |
| Dtype_t Dtype; /* Data type. */ |
| Mtype_t Mtype; /* Matrix type: describes the mathematical property of |
| the matrix. */ |
| int_t nrow; /* number of rows */ |
| int_t ncol; /* number of columns */ |
| void *Store; /* pointer to the actual storage of the matrix */ |
| } SuperMatrix; |
| /*********************************************** |
| * The storage schemes are defined as follows. * |
| ***********************************************/ |
| /* Stype == SLU_NC (Also known as Harwell-Boeing sparse matrix format) */ |
| typedef struct { |
| int_t nnz; /* number of nonzeros in the matrix */ |
| void *nzval; /* pointer to array of nonzero values, packed by column */ |
| int_t *rowind; /* pointer to array of row indices of the nonzeros */ |
| int_t *colptr; /* pointer to array of beginning of columns in nzval[] |
| and rowind[] */ |
| /* Note: |
| Zero-based indexing is used; |
| colptr[] has ncol+1 entries, the last one pointing |
| beyond the last column, so that colptr[ncol] = nnz. */ |
| } NCformat; |
| /* Stype == SLU_NR */ |
| typedef struct { |
| int_t nnz; /* number of nonzeros in the matrix */ |
| void *nzval; /* pointer to array of nonzero values, packed by raw */ |
| int_t *colind; /* pointer to array of columns indices of the nonzeros */ |
| int_t *rowptr; /* pointer to array of beginning of rows in nzval[] |
| and colind[] */ |
| /* Note: |
| Zero-based indexing is used; |
| rowptr[] has nrow+1 entries, the last one pointing |
| beyond the last row, so that rowptr[nrow] = nnz. */ |
| } NRformat; |
| /* Stype == SLU_SC */ |
| typedef struct { |
| int_t nnz; /* number of nonzeros in the matrix */ |
| int_t nsuper; /* number of supernodes, minus 1 */ |
| void *nzval; /* pointer to array of nonzero values, packed by column */ |
| int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */ |
| int_t *rowind; /* pointer to array of compressed row indices of |
| rectangular supernodes */ |
| int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */ |
| int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column |
| j belongs; mapping from column to supernode number. */ |
| int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th |
| supernode; mapping from supernode number to column. |
| e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) |
| sup_to_col: 0 1 2 4 7 12 (nsuper=4) */ |
| /* Note: |
| Zero-based indexing is used; |
| nzval_colptr[], rowind_colptr[], col_to_sup and |
| sup_to_col[] have ncol+1 entries, the last one |
| pointing beyond the last column. |
| For col_to_sup[], only the first ncol entries are |
| defined. For sup_to_col[], only the first nsuper+2 |
| entries are defined. */ |
| } SCformat; |
| /* Stype == SLU_SCP */ |
| typedef struct { |
| int_t nnz; /* number of nonzeros in the matrix */ |
| int_t nsuper; /* number of supernodes */ |
| void *nzval; /* pointer to array of nonzero values, packed by column */ |
| int_t *nzval_colbeg;/* nzval_colbeg[j] points to beginning of column j |
| in nzval[] */ |
| int_t *nzval_colend;/* nzval_colend[j] points to one past the last element |
| of column j in nzval[] */ |
| int_t *rowind; /* pointer to array of compressed row indices of |
| rectangular supernodes */ |
| int_t *rowind_colbeg;/* rowind_colbeg[j] points to beginning of column j |
| in rowind[] */ |
| int_t *rowind_colend;/* rowind_colend[j] points to one past the last element |
| of column j in rowind[] */ |
| int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column |
| j belongs; mapping from column to supernode. */ |
| int_t *sup_to_colbeg; /* sup_to_colbeg[s] points to the start of the s-th |
| supernode; mapping from supernode to column.*/ |
| int_t *sup_to_colend; /* sup_to_colend[s] points to one past the end of the |
| s-th supernode; mapping from supernode number to |
| column. |
| e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12) |
| sup_to_colbeg: 0 1 2 4 7 (nsuper=4) |
| sup_to_colend: 1 2 4 7 12 */ |
| /* Note: |
| Zero-based indexing is used; |
| nzval_colptr[], rowind_colptr[], col_to_sup and |
| sup_to_col[] have ncol+1 entries, the last one |
| pointing beyond the last column. */ |
| } SCPformat; |
| /* Stype == SLU_NCP */ |
| typedef struct { |
| int_t nnz; /* number of nonzeros in the matrix */ |
| void *nzval; /* pointer to array of nonzero values, packed by column */ |
| int_t *rowind;/* pointer to array of row indices of the nonzeros */ |
| /* Note: nzval[]/rowind[] always have the same length */ |
| int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] |
| and rowind[] */ |
| int_t *colend;/* colend[j] points to one past the last element of column |
| j in nzval[] and rowind[] */ |
| /* Note: |
| Zero-based indexing is used; |
| The consecutive columns of the nonzeros may not be |
| contiguous in storage, because the matrix has been |
| postmultiplied by a column permutation matrix. */ |
| } NCPformat; |
| /* Stype == SLU_DN */ |
| typedef struct { |
| int_t lda; /* leading dimension */ |
| void *nzval; /* array of size lda*ncol to represent a dense matrix */ |
| } DNformat; |
| /* Stype == SLU_NR_loc (Distributed Compressed Row Format) */ |
| typedef struct { |
| int_t nnz_loc; /* number of nonzeros in the local submatrix */ |
| int_t m_loc; /* number of rows local to this processor */ |
| int_t fst_row; /* global index of the first row */ |
| void *nzval; /* pointer to array of nonzero values, packed by row */ |
| int_t *rowptr; /* pointer to array of beginning of rows in nzval[] |
| and colind[] */ |
| int_t *colind; /* pointer to array of column indices of the nonzeros */ |
| /* Note: |
| Zero-based indexing is used; |
| rowptr[] has n_loc + 1 entries, the last one pointing |
| beyond the last row, so that rowptr[n_loc] = nnz_loc.*/ |
| } NRformat_loc; |
| #endif /* __SUPERLU_SUPERMATRIX */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/GetDiagU.c |
|---|
| New file |
| 0,0 → 1,107 |
| /*! @file |
| * \brief Extracts the main diagonal of matrix U |
| * |
| * <pre> |
| * -- Auxiliary routine in distributed SuperLU (version 1.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * Xiaoye S. Li |
| * April 16, 2002 |
| * </pre> |
| */ |
| #include "superlu_ddefs.h" |
| /*! \brief |
| * |
| * <pre> |
| * Purpose |
| * ======= |
| * |
| * GetDiagU extracts the main diagonal of matrix U of the LU factorization. |
| * |
| * Arguments |
| * ========= |
| * |
| * n (input) int |
| * Dimension of the matrix. |
| * |
| * LUstruct (input) LUstruct_t* |
| * The data structures to store the distributed L and U factors. |
| * see superlu_ddefs.h for its definition. |
| * |
| * grid (input) gridinfo_t* |
| * The 2D process mesh. It contains the MPI communicator, the number |
| * of process rows (NPROW), the number of process columns (NPCOL), |
| * and my process rank. It is an input argument to all the |
| * parallel routines. |
| * |
| * diagU (output) double*, dimension (n) |
| * The main diagonal of matrix U. |
| * On exit, it is available on all processes. |
| * |
| * |
| * Note |
| * ==== |
| * |
| * The diagonal blocks of the L and U matrices are stored in the L |
| * data structures, and are on the diagonal processes of the |
| * 2D process grid. |
| * |
| * This routine is modified from gather_diag_to_all() in pdgstrs_Bglobal.c. |
| * </pre> |
| */ |
| void GetDiagU(int_t n, LUstruct_t *LUstruct, gridinfo_t *grid, double *diagU) |
| { |
| int_t *xsup; |
| int iam, knsupc, pkk; |
| int nsupr; /* number of rows in the block L(:,k) (LDA) */ |
| int_t i, j, jj, k, lk, lwork, nsupers, p; |
| int_t num_diag_procs, *diag_procs, *diag_len; |
| Glu_persist_t *Glu_persist = LUstruct->Glu_persist; |
| LocalLU_t *Llu = LUstruct->Llu; |
| double *dblock, *dwork, *lusup; |
| iam = grid->iam; |
| nsupers = Glu_persist->supno[n-1] + 1; |
| xsup = Glu_persist->xsup; |
| get_diag_procs(n, Glu_persist, grid, &num_diag_procs, |
| &diag_procs, &diag_len); |
| jj = diag_len[0]; |
| for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] ); |
| if ( !(dwork = doubleMalloc_dist(jj)) ) ABORT("Malloc fails for dwork[]"); |
| for (p = 0; p < num_diag_procs; ++p) { |
| pkk = diag_procs[p]; |
| if ( iam == pkk ) { |
| /* Copy diagonal into buffer dwork[]. */ |
| lwork = 0; |
| for (k = p; k < nsupers; k += num_diag_procs) { |
| knsupc = SuperSize( k ); |
| lk = LBj( k, grid ); |
| nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ |
| lusup = Llu->Lnzval_bc_ptr[lk]; |
| for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */ |
| dwork[lwork+i] = lusup[i*(nsupr+1)]; |
| lwork += knsupc; |
| } |
| MPI_Bcast( dwork, lwork, MPI_DOUBLE, pkk, grid->comm ); |
| } else { |
| MPI_Bcast( dwork, diag_len[p], MPI_DOUBLE, pkk, grid->comm ); |
| } |
| /* Scatter dwork[] into global diagU vector. */ |
| lwork = 0; |
| for (k = p; k < nsupers; k += num_diag_procs) { |
| knsupc = SuperSize( k ); |
| dblock = &diagU[FstBlockC( k )]; |
| for (i = 0; i < knsupc; ++i) dblock[i] = dwork[lwork+i]; |
| lwork += knsupc; |
| } |
| } /* for p = ... */ |
| SUPERLU_FREE(diag_procs); |
| SUPERLU_FREE(diag_len); |
| SUPERLU_FREE(dwork); |
| } |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/Makefile |
|---|
| New file |
| 0,0 → 1,102 |
| ####################################################################### |
| # |
| # This makefile creates a library for distributed SuperLU. |
| # The files are organized as follows: |
| # |
| # ALLAUX -- Auxiliary routines called from all precisions |
| # DSLUSRC -- Double precision real serial SuperLU routines |
| # DPLUSRC -- Double precision real parallel SuperLU routines |
| # ZSLUSRC -- Double precision complex serial SuperLU routines |
| # ZPLUSRC -- Double precision complex parallel SuperLU routines |
| # |
| # The library can be set up to include routines for any combination |
| # of the two precisions. To create or add to the library, enter make |
| # followed by one or more of the precisions desired. Some examples: |
| # make double |
| # make double complex16 |
| # Alternatively, the command |
| # make |
| # without any arguments creates a library of all two precisions. |
| # The library is called |
| # superlu.a |
| # and is created at the next higher directory level. |
| # |
| # To remove the object files after the library is created, enter |
| # make clean |
| # |
| ####################################################################### |
| #include ../make.inc |
| include ../../../Makefile.def |
| DSUPERLULIB = $(DISTRIBUTED_SUPERLU_LIBRARY) |
| #DSUPERLULIB = a.lib |
| ARCH = $(AR) |
| ARCHFLAGS = $(ARFLAGS) |
| # |
| # Precision independent routines |
| # |
| ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \ |
| mmd.o comm.o memory.o util.o superlu_grid.o \ |
| pxerbla.o superlu_timer.o GetDiagU.o mc64ad.o symbfact.o \ |
| psymbfact.o psymbfact_util.o get_perm_c_parmetis.o |
| #### LAPACK auxiliary routines |
| LAAUX = lsame.o xerbla.o slamch.o dlamch.o |
| # |
| # Routines literally taken from SuperLU |
| # |
| DSLUSRC = dlangs.o dgsequ.o dlaqgs.o dutil.o \ |
| dmemory.o dmyblas2.o dsp_blas2.o dsp_blas3.o |
| ZSLUSRC = dcomplex.o zlangs.o zgsequ.o zlaqgs.o zutil.o \ |
| zmemory.o zmyblas2.o dmemory.o zsp_blas2.o zsp_blas3.o |
| # |
| # Routines for double precision parallel SuperLU |
| # DPLUSRC = dldperm.o ddistribute.o pdgstrf.o pdgstrs_Bglobal.o |
| DPLUSRC = dldperm.o ddistribute.o pdgstrf.o pdgstrs_Bglobal.o \ |
| pdgstrs1.o pdgssvx_ABglobal.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o\ |
| pdgssvx.o pdgstrs.o pddistribute.o pdlangs.o pdutil.o \ |
| pdgsequ.o pdlaqgs.o pdgsrfs.o pdgsmv.o pdgstrs_lsum.o \ |
| pdsymbfact_distdata.o |
| # |
| # Routines for double complex parallel SuperLU |
| # ZPLUSRC = zldperm.o zdistribute.o pzgssvx_ABglobal.o pzgstrf_irecv.o |
| ZPLUSRC = zldperm.o zdistribute.o pzgssvx_ABglobal.o pzgstrf.o \ |
| pzgstrs1.o pzgstrs_Bglobal.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \ |
| pzgssvx.o pzgstrs.o pzdistribute.o pzlangs.o pzutil.o \ |
| pzgsequ.o pzlaqgs.o pzgsrfs.o pzgsmv.o pzgstrs_lsum.o \ |
| pzsymbfact_distdata.o |
| all: double |
| double: $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX) |
| $(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \ |
| $(DSLUSRC) $(DPLUSRC) $(ALLAUX) $(LAAUX) |
| $(RANLIB) $(DSUPERLULIB) |
| complex16: $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX) |
| $(ARCH) $(ARCHFLAGS) $(DSUPERLULIB) \ |
| $(ZSLUSRC) $(ZPLUSRC) $(ALLAUX) $(LAAUX) |
| $(RANLIB) $(DSUPERLULIB) |
| ################################## |
| # Do not optimize these routines # |
| ################################## |
| slamch.o: slamch.c ; $(CC) $(NOOPTS) $(CDEFS) -c $< |
| dlamch.o: dlamch.c ; $(CC) $(NOOPTS) $(CDEFS) -c $< |
| ################################## |
| .c.o: |
| $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -c $< $(VERBOSE) |
| .f.o: |
| $(FORTRAN) $(FFLAGS) -c $< $(VERBOSE) |
| clean: |
| rm -f *.o $(DSUPERLULIB) |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/machines.h |
|---|
| New file |
| 0,0 → 1,53 |
| /*! @file |
| * \brief These macros define which machine will be used |
| * |
| * <pre> |
| * -- SuperLU MT routine (version 1.0) -- |
| * Univ. of California Berkeley, Xerox Palo Alto Research Center, |
| * and Lawrence Berkeley National Lab. |
| * August 15, 1997 |
| * |
| * These macros define which machine will be used. |
| * </pre> |
| */ |
| #ifndef __SUPERLU_MACHINES /* allow multiple inclusions */ |
| #define __SUPERLU_MACHINES |
| #define SGI 0 |
| #define ORIGIN 1 |
| #define DEC 2 |
| #define CRAY_T3E 3 |
| #define SUN 4 |
| #define PTHREAD 5 |
| #define IBM 6 |
| #ifdef _SGI |
| #define MACH SGI |
| #endif |
| #ifdef _ORIGIN |
| #define MACH ORIGIN |
| #endif |
| #ifdef _DEC |
| #define MACH DEC |
| #endif |
| #ifdef _CRAY |
| #define MACH CRAY_T3E |
| #endif |
| #ifdef _SOLARIS |
| #define MACH SUN |
| #endif |
| #ifdef _PTHREAD |
| #define MACH PTHREAD |
| #endif |
| #if ( defined(_SP2) || defined(_SP) ) |
| #define MACH IBM |
| #endif |
| #endif /* __SUPERLU_MACHINES */ |
| /trunk/OTHER/SuperLU_DIST_2.5/SRC/pdgsmv.c |
|---|
| New file |
| 0,0 → 1,373 |
| /*! @file |
| * \brief |
| * |
| * <pre> |
| * -- Distributed SuperLU routine (version 2.0) -- |
| * Lawrence Berkeley National Lab, Univ. of California Berkeley. |
| * March 15, 2003 |
| * </pre> |
| */ |
| #include <math.h> |
| #include "superlu_ddefs.h" |
| void pdgsmv_init |
| ( |
| SuperMatrix *A, /* Matrix A permuted by columns (input/output). |
| The type of A can be: |
| Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. */ |
| int_t *row_to_proc, /* Input. Mapping between rows and processes. */ |
| gridinfo_t *grid, /* Input */ |
| pdgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */ |
| ) |
| { |
| NRformat_loc *Astore; |
| int iam, p, procs; |
| int *SendCounts, *RecvCounts; |
| int_t i, j, k, l, m, m_loc, n, fst_row, jcol; |
| int_t TotalIndSend, TotalValSend; |
| int_t *colind, *rowptr; |
| int_t *ind_tosend = NULL, *ind_torecv = NULL; |
| int_t *ptr_ind_tosend, *ptr_ind_torecv; |
| int_t *extern_start, *spa, *itemp; |
| double *nzval, *val_tosend = NULL, *val_torecv = NULL, t; |
| MPI_Request *send_req, *recv_req; |
| MPI_Status status; |
| #if ( DEBUGlevel>=1 ) |
| CHECK_MALLOC(grid->iam, "Enter pdgsmv_init()"); |
| #endif |
| /* ------------------------------------------------------------ |
| INITIALIZATION. |
| ------------------------------------------------------------*/ |
| iam = grid->iam; |
| procs = grid->nprow * grid->npcol; |
| Astore = (NRformat_loc *) A->Store; |
| m = A->nrow; |
| n = A->ncol; |
| m_loc = Astore->m_loc; |
| fst_row = Astore->fst_row; |
| colind = Astore->colind; |
| rowptr = Astore->rowptr; |
| nzval = Astore->nzval; |
| if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) ) |
| ABORT("Malloc fails for SendCounts[]"); |
| /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/ |
| RecvCounts = SendCounts + procs; |
| if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) ) |
| ABORT("Malloc fails for ptr_ind_tosend[]"); |
| ptr_ind_torecv = ptr_ind_tosend + procs + 1; |
| if ( !(extern_start = intMalloc_dist(m_loc)) ) |
| ABORT("Malloc fails for extern_start[]"); |
| for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i]; |
| /* ------------------------------------------------------------ |
| COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS. |
| THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS. |
| SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE |
| LOCAL PART OF X. |
| THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A. |
| ------------------------------------------------------------*/ |
| if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */ |
| ABORT("Malloc fails for spa[]"); |
| for (p = 0; p < procs; ++p) SendCounts[p] = 0; |
| for (i = 0; i < m_loc; ++i) { /* Loop through each row */ |
| k = extern_start[i]; |
| for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */ |
| jcol = colind[j]; |
| p = row_to_proc[jcol]; |
| if ( p != iam ) { /* External */ |
| if ( spa[jcol] == 0 ) { /* First time see this index */ |
| ++SendCounts[p]; |
| spa[jcol] = 1; |
| } |
| } else { /* Swap to beginning the part of A corresponding |
| to the local part of X */ |
| l = colind[k]; |
| t = nzval[k]; |
| colind[k] = jcol; |
| nzval[k] = nzval[j]; |
| colind[j] = l; |
| nzval[j] = t; |
| ++k; |
| } |
| } |
| extern_start[i] = k; |
| } |
| /* ------------------------------------------------------------ |
| LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES. |
| THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A. |
| ------------------------------------------------------------*/ |
| /* Build pointers to ind_tosend[]. */ |
| ptr_ind_tosend[0] = 0; |
| for (p = 0, TotalIndSend = 0; p < procs; ++p) { |
| TotalIndSend += SendCounts[p]; /* Total to send. */ |
| ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p]; |
| } |
| #if 0 |
| ptr_ind_tosend[iam] = 0; /* Local part of X */ |
| #endif |
| if ( TotalIndSend ) { |
| if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) ) |
| ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */ |
| } |
| /* Build SPA to aid global to local translation. */ |
| for (i = 0; i < n; ++i) spa[i] = EMPTY; |
| for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */ |
| for (j = rowptr[i]; j < rowptr[i+1]; ++j) { |
| jcol = colind[j]; |
| if ( spa[jcol] == EMPTY ) { /* First time see this index */ |
| p = row_to_proc[jcol]; |
| if ( p == iam ) { /* Local */ |
| /*assert(jcol>=fst_row);*/ |
| spa[jcol] = jcol - fst_row; /* Relative position in local X */ |
| } else { /* External */ |
| ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */ |
| spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */ |
| ++ptr_ind_tosend[p]; |
| } |
| } |
| } |
| } |
| /* ------------------------------------------------------------ |
| TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES. |
| THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A. |
| ------------------------------------------------------------*/ |
| for (i = 0; i < m_loc; ++i) { |
| for (j = rowptr[i]; j < rowptr[i+1]; ++j) { |
| jcol = colind[j]; |
| colind[j] = spa[jcol]; |
| } |
| } |
| /* ------------------------------------------------------------ |
| COMMUNICATE THE EXTERNAL INDICES OF X. |
| ------------------------------------------------------------*/ |
| MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT, |
| grid->comm); |
| /* Build pointers to ind_torecv[]. */ |
| ptr_ind_torecv[0] = 0; |
| for (p = 0, TotalValSend = 0; p < procs; ++p) { |
| TotalValSend += RecvCounts[p]; /* Total to receive. */ |
| ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p]; |
| } |
| if ( TotalValSend ) { |
| if ( !(ind_torecv = intMalloc_dist(TotalValSend)) ) |
| ABORT("Malloc fails for ind_torecv[]"); |
| } |
| if ( !(send_req = (MPI_Request *) |
| SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) |
| ABORT("Malloc fails for recv_req[]."); |
| recv_req = send_req + procs; |
| for (p = 0; p < procs; ++p) { |
| ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */ |
| if ( SendCounts[p] ) { |
| MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p], |
| mpi_int_t, p, iam, grid->comm, &send_req[p]); |
| } |
| if ( RecvCounts[p] ) { |
| MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p], |
| mpi_int_t, p, p, grid->comm, &recv_req[p]); |
| } |
| } |
| for (p = 0; p < procs; ++p) { |
| if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status); |