! Parallel matrix multiplication main program
! Fernando G. Tinetti
!
! Expected usage:
! myparmm_1 <size> <threads>
! Multiplies square matrices of size x size elements, using <threads>
! OpenMP threads on each MPI process
!
! Based on the algorithm given in http://ftinetti.googlepages.com/phdthesis
!
! Uses as many processes as started through mpi, and on each process
! ompthreads are used, "assuming" SMP/multicore computers.
!
! This "version 1" is that of non-overlapped computing and communication
!
! mpi functions and constants (does not work here...)
!include "mpif.h"
! **********************************************************
PROGRAM MAIN
! The main program just sets things up (command line parameter checking,
! MPI initialization, size/configuration report) and calls the computing
! subroutine calculo.
!
! Usage: myparmm_1 <size> <threads>
!   size:    square matrices are size x size elements
!   threads: number of OpenMP threads requested per MPI process
IMPLICIT NONE
! mpi functions and constants (mpif.h is written to be safe under IMPLICIT NONE)
include "mpif.h"
! vars related to command line parameters
! argc: number of real command line parameters (expected to be 2)
! size: square matrices size
! thre: number of openmp threads requested
! buffer: just an intermediate place to get command line values
INTEGER :: argc, size, thre
CHARACTER *100 buffer
! vars related to mpi information/setting
INTEGER :: rank, nprocs, ierr
! total data per process: 4 submatrices (a, c, b, bwrk) of size*size/nprocs
! REAL (4-byte) elements each; computed in REAL arithmetic to avoid the
! integer truncation/overflow of the previous integer-only expression
REAL :: mbytes_data, mbytes_bcast
! verify command line parameters (minimum)
argc = COMMAND_ARGUMENT_COUNT()
IF (.not. (argc .eq. 2)) THEN
PRINT *,"usage: myparmm_1 <size> <threads>"
STOP
END IF
! get command line parameters into local vars.
CALL GET_COMMAND_ARGUMENT(1,buffer) ! Get matrix size (1/2)
READ(buffer,*) size ! Get matrix size (2/2)
CALL GET_COMMAND_ARGUMENT(2,buffer) ! Get number of threads (1/2)
READ(buffer,*) thre ! Get number of threads (2/2)
! init mpi "environment" (get the environment, in fact)
CALL MPI_INIT(ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr)
IF (rank .eq. 0) THEN
! one Bcast moves a single size x size/nprocs submatrix of 4-byte REALs
mbytes_bcast = REAL(size) * REAL(size) / REAL(nprocs) * 4.0 / 1.0E6
! each process holds 4 such submatrices (a, c, b, bwrk)
mbytes_data = 4.0 * mbytes_bcast
PRINT *, "Matrices size: ", size, "x", size, " elements"
PRINT *, "Number of MPI processes: ", nprocs
PRINT *, "Number of OpenMP threads per MPI process: ", thre
PRINT *, "Memory for data in M(10^6)B: ", mbytes_data
PRINT *, "Each Bcast is ", mbytes_bcast, "M(10^6)B"
END IF
CALL calculo(size, nprocs, thre, rank)
CALL MPI_FINALIZE(ierr)
END PROGRAM MAIN
! **********************************************************
SUBROUTINE calculo(n, np, th, myrank)
! Distributed matrix multiply c = a x b, non-overlapped version:
! on each of the np iterations, the b submatrix of one process is
! broadcast to all, then every process computes the corresponding
! n/np x n/np partial block of its local c rows.
!
! n:      square matrix size
! np:     total number of MPI processes
! th:     number of OpenMP threads to create/use
! myrank: rank of "this" process
IMPLICIT NONE
! mpi functions and constants (included after IMPLICIT NONE, as the
! include file contains declarations that must follow it)
include "mpif.h"
! formal parameters
INTEGER, INTENT(IN) :: n, np, th, myrank
! local submatrices
REAL, DIMENSION(n/np, n) :: a, c ! distributed by row submatrices
REAL, DIMENSION(n, n/np) :: b    ! distributed by column submatrices
! buffer to recv b submatrices from other processes, also used on
! every partial matrix multiplication, b data is always here for mm
REAL, DIMENSION(n, n/np) :: bwrk
! local vars
INTEGER :: i, ii, jj, begcol, ierr
! init matrices a and b such that c(i, j) = n (n additions of 1.0*1.0,
! exact in REAL arithmetic for moderate n, so the check below is exact)
a = 1.0
b = 1.0
bwrk = 0.0
! set the number of threads for openMP (created later at OMP PARALLEL)
! on production code, this should be set to the number of CPUs/host
CALL OMP_SET_NUM_THREADS(th)
! main loop: each process bcast the corresponding b data
! and makes partial c computing on each iteration
DO i = 1, np
! bwrk should have the b submatrix needed in this iteration
IF ((i-1) .eq. myrank) THEN ! processes ids from 0
bwrk = b ! local submatrix used in this iteration
END IF
CALL MPI_BCAST(bwrk, n*n/np, MPI_REAL, i-1, MPI_COMM_WORLD, ierr)
IF (ierr .ne. MPI_SUCCESS) THEN
PRINT *, "Bcast with root ", i-1, " error on process ", myrank
END IF
! now do the partial computing of c submatrix, advance on 1st row
! by n/np columns, which correspond to the number of b columns used
begcol = (i-1)*n/np + 1 ! begins according to the i_th b "submatrix"
CALL mymatmul(a, bwrk, c(1, begcol), n/np, n/np, n, th)
END DO
! Just checking: every element of c must be exactly n (see init above)
DO ii = 1, n/np
DO jj = 1, n
IF (.not. (c(ii, jj) .eq. n)) THEN
PRINT *, "Error on value of c(", ii, ", ", jj, ") on ", myrank, ": ", c(ii, jj)
END IF
END DO
END DO
PRINT *, "No problems found at c if nothing reported on ", myrank
END SUBROUTINE calculo
! **********************************************************
! should resemble and finally call (just like a wrapper)
! SUBROUTINE S/DGEMM(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
SUBROUTINE mymatmul(a, b, c, m, n, k, th)
! compute c = a x b (a, b, and c are matrices); thin wrapper over
! BLAS SGEMM with ALPHA = 1, BETA = 0 and no transposition, i.e.
! c is completely overwritten (its input contents are ignored).
! a(m, k): first (matrix) operand
! b(k, n): second (matrix) operand
! c(m, n): result (matrix), fully overwritten
! th: number of openMP threads expected to be used... currently
!     unused here (the OpenMP column distribution was never coded);
!     kept so the interface matches the callers
IMPLICIT NONE
! formal parameters (integers first: m, n, k size the array dummies below)
INTEGER, INTENT(IN) :: m, n, k, th
REAL, DIMENSION(m, k), INTENT(IN)  :: a
REAL, DIMENSION(k, n), INTENT(IN)  :: b
REAL, DIMENSION(m, n), INTENT(OUT) :: c
! local vars. for calling SGEMM
REAL :: ALPHA, BETA
CHARACTER :: TRANSA, TRANSB
! c = ALPHA * a x b + BETA * c, with leading dimensions equal to the
! declared first extents (m, k, m) since the submatrices are contiguous
ALPHA = 1.0
BETA = 0.0
TRANSA = 'N'
TRANSB = 'N'
CALL SGEMM(TRANSA, TRANSB, m, n, k, ALPHA, a, m, b, k, BETA, c, m)
END SUBROUTINE mymatmul
! **********************************************************