PTile – A parallel parametric tiling software for imperfectly nested loops
PTile is a software tool to generate parallel parametric tiled code (where the generated tiled code is parallel and tile sizes are symbolic constants that can be set during run-time) for affine loop nests: loop nests whose bounds and array access functions are affine functions of loop-iterators and program parameters. This software is based on the paper: “Parameterized Tiling Revisited”, which was published in CGO'10.
Download
PTile has been integrated into the Polyhedral Compiler Collection (PoCC) and can be downloaded from PoCC website.
Usage
To
generate parallel parameterized tiled code, enclose
the loop-nest that you would like to tile with the pragma directives: #pragma scop and
#pragma endscop
and
invoke 'pocc' in one of the following two ways:
$pocc
--pluto --ptile <input-file-name>
When
pocc is invoked with --pluto option, the skewing transformation is
applied if necessary to make the input loop nest tilable and --ptile
option uses PTile module to generate parametric tiled code.
$pocc
--pluto --ptile-fts <input-file-name>
When
--ptile-fts option is used, PTile generates parametric tiled code
with full tiles separated.
Additionally, --punroll may be used to unroll the loop and --past-hoist-lb to hoist the loop-bound expressions for increased performance.
Example
Input Code: gemm.c
#pragma scop /* C := alpha*A*B + beta*C */ for (i = 0; i < ni; i++) for (j = 0; j < nj; j++) { C[i][j] *= beta; for (k = 0; k < nk; ++k) C[i][j] += alpha * A[i][k] * B[k][j]; } #pragma endscop |
Output Codes:
$pocc --pluto --ptile gemm.c
#include <math.h> #ifdef ceild # undef ceild #endif #ifdef floord # undef floord #endif #ifdef max # undef max #endif #ifdef min # undef min #endif #define ceild(n,d) ceil(((double)(n))/((double)(d))) #define floord(n,d) floor(((double)(n))/((double)(d))) #define max(x,y) ((x) > (y)? (x) : (y)) #define min(x,y) ((x) < (y)? (x) : (y)) register int lbv, ubv, lb, ub, lb1, ub1, lb2, ub2; register int c1t1, c2t1, c1, c2, c3t1, c3; #pragma scop if (((ni >= 1) && (nj >= 1))) { lb1 = round((-1 + (1 / T1c1))); ub1 = round(((ni * (1 / T1c1)) + ((1 / T1c1) * -1))); #pragma omp parallel for private(c2t1, c1, c2) firstprivate(lb1, ub1) for (c1t1 = lb1; c1t1 <= ub1; ++(c1t1)) { for (c2t1 = round((-1 + (1 / T1c2))); c2t1 <= round(((nj * (1 / T1c2)) + ((1 / T1c2) * -1))); ++(c2t1)) { for (c1 = max((c1t1 * T1c1), 0); c1 <= min(((c1t1 * T1c1) + (T1c1 + -1)), (ni + -1)); c1++) { for (c2 = max((c2t1 * T1c2), 0); c2 <= min(((c2t1 * T1c2) + (T1c2 + -1)), (nj + -1)); c2++) { C[c1][c2]*=beta; } } } } if ((nk >= 1)) { lb1 = round((-1 + (1 / T1c1))); ub1 = round(((ni * (1 / T1c1)) + ((1 / T1c1) * -1))); #pragma omp parallel for private(c2t1, c3t1, c1, c2, c3) firstprivate(lb1, ub1) for (c1t1 = lb1; c1t1 <= ub1; ++(c1t1)) { for (c2t1 = round((-1 + (1 / T1c2))); c2t1 <= round(((nj * (1 / T1c2)) + ((1 / T1c2) * -1))); ++(c2t1)) { for (c3t1 = round((-1 + (1 / T1c3))); c3t1 <= round(((nk * (1 / T1c3)) + ((1 / T1c3) * -1))); ++(c3t1)) { for (c1 = max((c1t1 * T1c1), 0); c1 <= min(((c1t1 * T1c1) + (T1c1 + -1)), (ni + -1)); c1++) { for (c2 = max((c2t1 * T1c2), 0); c2 <= min(((c2t1 * T1c2) + (T1c2 + -1)), (nj + -1)); c2++) { for (c3 = max((c3t1 * T1c3), 0); c3 <= min(((c3t1 * T1c3) + (T1c3 + -1)), (nk + -1)); c3++) { C[c1][c2]+=alpha*A[c1][c3]*B[c3][c2]; } } } } } } } } #pragma endscop |
$pocc
--pluto --ptile-fts gemm.c
#include <math.h> #ifdef ceild # undef ceild #endif #ifdef floord # undef floord #endif #ifdef max # undef max #endif #ifdef min # undef min #endif #define ceild(n,d) ceil(((double)(n))/((double)(d))) #define floord(n,d) floor(((double)(n))/((double)(d))) #define max(x,y) ((x) > (y)? (x) : (y)) #define min(x,y) ((x) < (y)? (x) : (y)) register int lbv, ubv, lb, ub, lb1, ub1, lb2, ub2; register int c1t1, c2t1, c1, c2, c3t1, c3; #pragma scop if (((ni >= 1) && (nj >= 1))) { lb1 = round((-1 + (1 / T1c1))); ub1 = round(((ni * (1 / T1c1)) + ((1 / T1c1) * -1))); #pragma omp parallel for private(c2t1, c1, c2) firstprivate(lb1, ub1) for (c1t1 = lb1; c1t1 <= ub1; ++(c1t1)) { for (c2t1 = round((-1 + (1 / T1c2))); c2t1 <= round(((nj * (1 / T1c2)) + ((1 / T1c2) * -1))); ++(c2t1)) { if (((((c1t1 * T1c1) >= 0) && ((c1t1 * T1c1) + (T1c1 + -1)) <= (ni + -1)) && (((c2t1 * T1c2) >= 0) && ((c2t1 * T1c2) + (T1c2 + -1)) <= (nj + -1)))) { for (c1 = (c1t1 * T1c1); c1 <= ((c1t1 * T1c1) + (T1c1 + -1)); ++(c1)) { for (c2 = (c2t1 * T1c2); c2 <= ((c2t1 * T1c2) + (T1c2 + -1)); ++(c2)) { C[c1][c2]*=beta; } } } else { for (c1 = max((c1t1 * T1c1), 0); c1 <= min(((c1t1 * T1c1) + (T1c1 + -1)), (ni + -1)); c1++) { for (c2 = max((c2t1 * T1c2), 0); c2 <= min(((c2t1 * T1c2) + (T1c2 + -1)), (nj + -1)); c2++) { C[c1][c2]*=beta; } } } } } if ((nk >= 1)) { lb1 = round((-1 + (1 / T1c1))); ub1 = round(((ni * (1 / T1c1)) + ((1 / T1c1) * -1))); #pragma omp parallel for private(c2t1, c3t1, c1, c2, c3) firstprivate(lb1, ub1) for (c1t1 = lb1; c1t1 <= ub1; ++(c1t1)) { for (c2t1 = round((-1 + (1 / T1c2))); c2t1 <= round(((nj * (1 / T1c2)) + ((1 / T1c2) * -1))); ++(c2t1)) { for (c3t1 = round((-1 + (1 / T1c3))); c3t1 <= round(((nk * (1 / T1c3)) + ((1 / T1c3) * -1))); ++(c3t1)) { if ((((((c1t1 * T1c1) >= 0) && ((c1t1 * T1c1) + (T1c1 + -1)) <= (ni + -1)) && (((c2t1 * T1c2) >= 0) && ((c2t1 * T1c2) + (T1c2 + -1)) <= (nj + -1))) && (((c3t1 * T1c3) >= 0) && ((c3t1 * T1c3) + (T1c3 + -1)) <= (nk + -1)))) { for (c1 = (c1t1 * T1c1); c1 <= ((c1t1 * T1c1) + (T1c1 + -1)); ++(c1)) { for (c2 = (c2t1 * T1c2); c2 <= ((c2t1 * T1c2) + (T1c2 + -1)); ++(c2)) { for (c3 = (c3t1 * T1c3); c3 <= ((c3t1 * T1c3) + (T1c3 + -1)); ++(c3)) { C[c1][c2]+=alpha*A[c1][c3]*B[c3][c2]; } } } } else { for (c1 = max((c1t1 * T1c1), 0); c1 <= min(((c1t1 * T1c1) + (T1c1 + -1)), (ni + -1)); c1++) { for (c2 = max((c2t1 * T1c2), 0); c2 <= min(((c2t1 * T1c2) + (T1c2 + -1)), (nj + -1)); c2++) { for (c3 = max((c3t1 * T1c3), 0); c3 <= min(((c3t1 * T1c3) + (T1c3 + -1)), (nk + -1)); c3++) { C[c1][c2]+=alpha*A[c1][c3]*B[c3][c2]; } } } } } } } } } #pragma endscop |