/* Parallel implementation of the matrix multiplication program.
	NOTE: This program runs on n number of nodes and uses the block 
	multiplication technique.
	The second matrix remains on the root node and the first matrix is 
	divided into n parts and each part is sent to the node.
	Each node computes the result of the multiplication and sends it
	back to the server.
	Thus, the result is collected back on the server.
*/

#include <stdio.h>
#include "mpi.h"

int main(int argc, char *argv[])
{
	int rank, nproc, namelen,i,j,k,start,end,tag,unit,msize;
	int **matrix,**matrix1,**matrix2;
	char processorname[MPI_MAX_PROCESSOR_NAME];
	double starttime = 0.0, endtime = 0.0;
	MPI_Status s;

	MPI_Init(&argc,&argv);

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
	MPI_Get_processor_name(processorname,&namelen);

	if(argv[1]==NULL)
	{	
		if(rank==0)
		printf("Usage: matrix [size]\n");

		MPI_Finalize();
		return 1;
	}
	msize= atoi(argv[1]);
	unit = msize/nproc;

	matrix= (int **) malloc(msize * sizeof(int *));
	matrix1= (int **) malloc(msize * sizeof(int *));
	matrix2= (int **) malloc(msize * sizeof(int *));

	for(i=0;i<msize;i++){
	matrix1[i] = (int *) malloc(msize * sizeof(int));
	matrix2[i] = (int *) malloc(msize * sizeof(int));
	matrix[i] = (int *) malloc(msize * sizeof(int));
	}

	if(rank==0){
	// Initializing the matrices with random values on the root node

	printf("\nGenerating two random matrices of size %d.....",msize);
	for(i=0;i<msize;i++){
		for(j=0;j<msize;j++){
			matrix1[i][j] = (rand() % 90)+10;
			matrix2[i][j] = (rand() % 90)+10;
			matrix[i][j] = 0;
				}
			}

/*	// Routine to print the matrix values
	
	for(i=0;i<msize;i++){
		for(j=0;j<msize;j++){
			printf("%d ",matrix1[i][j]);
			}
		printf("       ");
		for(j=0;j<msize;j++){	
			printf("%d ",matrix2[i][j]);
				}
		printf("\n");
		}
*/
	start = 0;
	end = (msize/nproc);
	printf("\nMultiplying the two matrices.....\n");
	starttime = MPI_Wtime();

	MPI_Bcast(&matrix2[0][0],(msize*msize*3)+40,MPI_INT,0,MPI_COMM_WORLD);
	for(i=0;i<nproc;i++){
		start = unit * i;
		end = unit * (i+1);
		if(i!=0){
		MPI_Send(&start,1,MPI_INT,i,1,MPI_COMM_WORLD);
		MPI_Send(&end,1,MPI_INT,i,2,MPI_COMM_WORLD);
		MPI_Send(&matrix1[start][0],(msize*unit*3)+8,MPI_INT,i,3,MPI_COMM_WORLD);
		}
		}

	}
	else
	{
		MPI_Bcast(&matrix2[0][0],(msize*msize*3)+40,MPI_INT,0,MPI_COMM_WORLD);
		MPI_Recv(&start,1,MPI_INT,0,1,MPI_COMM_WORLD,&s);
		MPI_Recv(&end,1,MPI_INT,0,2,MPI_COMM_WORLD,&s);
		MPI_Recv(&matrix1[start][0],(msize*unit*3)+8,MPI_INT,0,3,MPI_COMM_WORLD,&s);
	}

//	printf("\nProcess %d of %d on %s\n", rank,nproc, processorname);
//	printf("start = %d end = %d",start,end);

	start = unit * rank;
	end = unit * (rank+1);

	MPI_Barrier(MPI_COMM_WORLD);

	for(i=start;i<end;i++) {
		for(j=0;j<msize;j++) {
			for(k=0;k<msize;k++) {
			matrix[i][j] = matrix[i][j]+(matrix1[i][k] * matrix2[k][j]);
			}
		}
	}

	if(rank!=0)
	{
	MPI_Send(&matrix[start][0],(msize*unit*2)+6,MPI_INT,0,7,MPI_COMM_WORLD);
	}

	if(rank==0){
	for(i=1;i<nproc;i++){
		start = unit * i;
		if(i!=0){
		MPI_Recv(&matrix[start][0],(msize*unit*2)+6,MPI_INT,i,7,MPI_COMM_WORLD,&s);
	}
	}
	endtime = MPI_Wtime();

/*	// Function to display the final resultant matrix.
	printf("\n");
	for(i=0;i<msize;i++){
	for(j=0;j<msize;j++){
		printf("%d ",matrix[i][j]);
		}
	printf("\n");
	}
*/
	free(matrix1);
	free(matrix2);
	printf("\nClock time:  %f \n",endtime-starttime);
}

	free(matrix);
	MPI_Finalize();
	return 0;
}
