public static void unaryAggregate()

in src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCUDA.java [583:904]


	/**
	 * Performs a unary aggregate (full, row-wise or column-wise reduction) of a matrix on the GPU.
	 * <p>
	 * The index function of {@code op} selects the reduction direction and the increment function of
	 * its aggregate operator selects the operation. Results are delivered as follows:
	 * <ul>
	 * <li>{@code ReduceAll}: the scalar result is stored in {@code ec} under {@code output} as a {@code DoubleObject}.</li>
	 * <li>{@code ReduceRow}: a dense {@code 1 x numColumns} output matrix is allocated under {@code output}.</li>
	 * <li>{@code ReduceCol}: a dense {@code numRows x 1} output matrix is allocated under {@code output}.</li>
	 * </ul>
	 * Implemented combinations: sum, sum of squares, mean, max, min and variance for all three of the above
	 * directions, and product for {@code ReduceAll} only. {@code ReduceDiag}, {@code MAXINDEX} and
	 * {@code MININDEX} are recognized but always rejected with a {@link DMLRuntimeException}.
	 * <p>
	 * Kahan-compensated operators are executed as plain summations. A sparse input is converted to dense
	 * before the reduction. Variance divides by {@code n - 1}. Temporary device buffers are released even
	 * when the reduction fails.
	 *
	 * @param ec       execution context; its GPU context at index 0 must be {@code gCtx}
	 * @param gCtx     GPU context used for allocations and kernel launches
	 * @param instName name of the invoking instruction, forwarded to allocation and kernel helpers
	 * @param in1      input matrix; must already be allocated on the GPU
	 * @param output   name of the output variable (matrix or scalar)
	 * @param op       operator holding the index function (direction) and the aggregate function
	 * @throws DMLRuntimeException if the GPU context does not match, the input is not allocated on the GPU,
	 *                             the operator/direction combination is unsupported, or a full reduction is
	 *                             requested on an input whose cell count does not fit into an int
	 */
	public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) {
		if (ec.getGPUContext(0) != gCtx)
			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
		if(LOG.isTraceEnabled()) {
			LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx);
		}
		final int REDUCTION_ALL = 1;
		final int REDUCTION_ROW = 2;
		final int REDUCTION_COL = 3;
		final int REDUCTION_DIAG = 4;

		// A Kahan sum implementation is not provided. If a "uak+" or other Kahan operator is encountered,
		// a regular summation reduction is performed instead.
		final int OP_PLUS = 1;
		final int OP_PLUS_SQ = 2;
		final int OP_MEAN = 3;
		final int OP_VARIANCE = 4;
		final int OP_MULTIPLY = 5;
		final int OP_MAX = 6;
		final int OP_MIN = 7;
		final int OP_MAXINDEX = 8;
		final int OP_MININDEX = 9;


		// Sanity Checks
		if(!in1.getGPUObject(gCtx).isAllocated())
			throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject(gCtx).isAllocated());

		boolean isSparse = in1.getGPUObject(gCtx).isSparse();
		IndexFunction indexFn = op.indexFn;
		AggregateOperator aggOp = op.aggOp;

		// Convert reduction direction to a number; every branch either assigns or throws,
		// so no sentinel value (and no follow-up check) is needed.
		int reductionDirection;
		if (indexFn instanceof ReduceAll){
			reductionDirection = REDUCTION_ALL;
		} else if (indexFn instanceof ReduceRow){
			reductionDirection = REDUCTION_ROW;
		} else if (indexFn instanceof ReduceCol){
			reductionDirection = REDUCTION_COL;
		} else if (indexFn instanceof ReduceDiag){
			reductionDirection = REDUCTION_DIAG;
		} else {
			throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
		}

		// Convert function type to a number (again: assign or throw on every path)
		int opIndex;
		if (aggOp.increOp.fn instanceof KahanPlus) {
			opIndex = OP_PLUS;
		} else if (aggOp.increOp.fn instanceof KahanPlusSq) {
			opIndex = OP_PLUS_SQ;
		} else if (aggOp.increOp.fn instanceof Mean) {
			opIndex = OP_MEAN;
		} else if (aggOp.increOp.fn instanceof CM) {
			if(((CM)aggOp.increOp.fn).getAggOpType() != CMOperator.AggregateOperationTypes.VARIANCE)
				throw new DMLRuntimeException("Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU");
			opIndex = OP_VARIANCE;
		} else if (aggOp.increOp.fn instanceof Plus) {
			opIndex = OP_PLUS;
		} else if (aggOp.increOp.fn instanceof Multiply) {
			opIndex = OP_MULTIPLY;
		} else if (aggOp.increOp.fn instanceof Builtin) {
			Builtin b = (Builtin)aggOp.increOp.fn;
			switch(b.bFunc) {
			case MAX: opIndex = OP_MAX; break;
			case MIN: opIndex = OP_MIN; break;
			case MAXINDEX: opIndex = OP_MAXINDEX; break;
			case MININDEX: opIndex = OP_MININDEX;break;
			default:
				throw new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
			}
		} else {
			throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
		}

		int rlen = (int)in1.getNumRows();
		int clen = (int)in1.getNumColumns();

		// Count cells in 64-bit arithmetic: an int product wraps silently for large inputs, which
		// would produce undersized scratch buffers and a wrong element count for full reductions.
		final long numCells = (long) rlen * clen;
		if (reductionDirection == REDUCTION_ALL && numCells > Integer.MAX_VALUE)
			throw new DMLRuntimeException("Internal Error - Input with " + numCells + " cells is too large for a full aggregation on GPU");
		// Only consumed by full reductions, for which the guard above makes the narrowing exact.
		final int size = (int) numCells;

		if (isSparse){
			// The strategy for the time being is to convert sparse to dense
			// until a sparse specific kernel is written.
			in1.getGPUObject(gCtx).sparseToDense(instName);
		}

		Pointer out = null;
		if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) {
			// Matrix output: ReduceRow (COL{SUM, MAX...}) yields 1 x clen, ReduceCol (ROW{SUM, MAX,...}) yields rlen x 1
			long outRLen = (reductionDirection == REDUCTION_ROW) ? 1 : rlen;
			long outCLen = (reductionDirection == REDUCTION_ROW) ? clen : 1;
			MatrixObject out1 = getDenseMatrixOutputForGPUInstruction(ec, instName, output, outRLen, outCLen);
			out = getDensePointer(gCtx, out1, instName);
		}

		Pointer in = getDensePointer(gCtx, in1, instName);

		// For scalars, set the scalar output in the Execution Context object
		switch (opIndex){
		case OP_PLUS: {
			switch(reductionDirection) {
			case REDUCTION_ALL : {
				double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
				ec.setScalarOutput(output, new DoubleObject(result));
				break;
			}
			case REDUCTION_COL : {	// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
				reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen);
				break;
			}
			case REDUCTION_ROW : {
				reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen);
				break;
			}
			case REDUCTION_DIAG :
				throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
			}
			break;
		}
		case OP_PLUS_SQ : {
			// Calculate the squares in a temporary object tmp
			Pointer tmp = gCtx.allocate(instName, numCells * sizeOfDataType, false);
			try {
				squareMatrix(gCtx, instName, in, tmp, rlen, clen);
				// Then do the sum on the temporary object
				switch(reductionDirection) {
				case REDUCTION_ALL : {
					double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size);
					ec.setScalarOutput(output, new DoubleObject(result));
					break;
				}
				case REDUCTION_COL : {	// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
					reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen);
					break;
				}
				case REDUCTION_ROW : {
					reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen);
					break;
				}
				default:
					throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
				}
			}
			finally {
				// Release the scratch buffer on success and on every failure path
				gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
			}
			break;
		}
		case OP_MEAN:{
			switch(reductionDirection) {
			case REDUCTION_ALL: {
				double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
				double mean = result / size;
				ec.setScalarOutput(output, new DoubleObject(mean));
				break;
			}
			case REDUCTION_COL: {
				reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
				break;
			}
			case REDUCTION_ROW: {
				reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
				break;
			}
			default:
				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
			}
			break;
		}
		case OP_MULTIPLY : {
			switch (reductionDirection) {
			case REDUCTION_ALL: {
				double result = reduceAll(gCtx, instName, "reduce_prod", in, size);
				ec.setScalarOutput(output, new DoubleObject(result));
				break;
			}
			default:
				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
			}
			break;
		}
		case OP_MAX :{
			switch(reductionDirection) {
			case REDUCTION_ALL: {
				double result = reduceAll(gCtx, instName, "reduce_max", in, size);
				ec.setScalarOutput(output, new DoubleObject(result));
				break;
			}
			case REDUCTION_COL: {
				reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen);
				break;
			}
			case REDUCTION_ROW: {
				reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen);
				break;
			}
			default:
				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
			}
			break;
		}
		case OP_MIN :{
			switch(reductionDirection) {
			case REDUCTION_ALL: {
				double result = reduceAll(gCtx, instName, "reduce_min", in, size);
				ec.setScalarOutput(output, new DoubleObject(result));
				break;
			}
			case REDUCTION_COL: {
				reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen);
				break;
			}
			case REDUCTION_ROW: {
				reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen);
				break;
			}
			default:
				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
			}
			break;
		}
		case OP_VARIANCE : {
			// Temporary GPU arrays: tmp holds (x - mean), tmp2 holds (x - mean)^2
			Pointer tmp = null;
			Pointer tmp2 = null;
			try {
				tmp = gCtx.allocate(instName, numCells * sizeOfDataType, false);
				tmp2 = gCtx.allocate(instName, numCells * sizeOfDataType, false);

				switch(reductionDirection) {

				case REDUCTION_ALL: {
					double result = reduceAll(gCtx, instName, "reduce_sum", in, size);
					double mean = result / size;

					// Subtract mean from every element in the matrix
					ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
					matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp);

					squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);

					double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size);
					double variance = result2 / (size - 1);
					ec.setScalarOutput(output, new DoubleObject(variance));

					break;
				}
				case REDUCTION_COL: {
					reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen);
					// Subtract the row-wise mean from every element in the matrix
					BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
					matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);

					squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);

					Pointer tmpRow = gCtx.allocate(instName, (long) rlen * sizeOfDataType, false);
					try {
						reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);

						ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
						matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
					}
					finally {
						gCtx.cudaFreeHelper(instName, tmpRow, DMLScript.EAGER_CUDA_FREE);
					}

					break;
				}
				case REDUCTION_ROW: {
					reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen);
					// Subtract the columns-wise mean from every element in the matrix
					BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
					matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);

					squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen);

					Pointer tmpCol = gCtx.allocate(instName, (long) clen * sizeOfDataType, false);
					try {
						reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);

						ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
						matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
					}
					finally {
						gCtx.cudaFreeHelper(instName, tmpCol, DMLScript.EAGER_CUDA_FREE);
					}

					break;
				}
				default:
					throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
				}
			}
			finally {
				// Either buffer may still be null if its allocation failed
				if (tmp != null)
					gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
				if (tmp2 != null)
					gCtx.cudaFreeHelper(instName, tmp2, DMLScript.EAGER_CUDA_FREE);
			}
			break;
		}
		case OP_MAXINDEX : {
			// Not implemented for any direction: every path throws, so no break is needed
			switch(reductionDirection) {
			case REDUCTION_COL:
				throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
			default:
				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
			}
		}
		case OP_MININDEX : {
			// Not implemented for any direction: every path throws, so no break is needed
			switch(reductionDirection) {
			case REDUCTION_COL:
				throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
			default:
				throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
			}
		}
		default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
		}
	}