PTile – A parallel parametric tiling software for imperfectly nested loops


PTile is a software tool to generate parallel parametric tiled code (where the generated tiled code is parallel and tile sizes are symbolic constants that can be set during run-time) for affine loop nests: loop nests whose bounds and array access functions are affine functions of loop-iterators and program parameters. This software is based on the paper: “Parameterized Tiling Revisited”, which was published in CGO'10.

Download

PTile has been integrated into the Polyhedral Compiler Collection (PoCC) and can be downloaded from PoCC website.

Usage

To generate parallel parameterized tiled code, enclose the loop-nest that you would like to tile with the pragma directives: #pragma scop and #pragma endscop
and invoke 'pocc' in one of the following two ways:

$pocc --pluto --ptile <input-file-name>
When pocc is invoked with --pluto option, the skewing transformation is applied if necessary to make the input loop nest tilable and --ptile option uses PTile module to generate parametric tiled code.

$pocc --pluto --ptile-fts <input-file-name>
When --ptile-fts option is used, PTile generates parametric tiled code with full tiles separated.

Additionally,  --punroll may be used to unroll the loop and --past-hoist-lb to hoist the loop-bound expressions for increased performance.

Example

Input Code: gemm.c


#pragma scop

  /* C := alpha*A*B + beta*C */
  for (i = 0; i < ni; i++)
    for (j = 0; j < nj; j++)
      {
    C[i][j] *= beta;
    for (k = 0; k < nk; ++k)
      C[i][j] += alpha * A[i][k] * B[k][j];
      }

#pragma endscop



Output Codes:

$pocc --pluto --ptile gemm.c


#include <math.h>
#ifdef ceild
# undef ceild
#endif
#ifdef floord
# undef floord
#endif
#ifdef max
# undef max
#endif
#ifdef min
# undef min
#endif
#define ceild(n,d)  ceil(((double)(n))/((double)(d)))
#define floord(n,d) floor(((double)(n))/((double)(d)))
#define max(x,y)    ((x) > (y)? (x) : (y))
#define min(x,y)    ((x) < (y)? (x) : (y))


  register int lbv, ubv, lb, ub, lb1, ub1, lb2, ub2;
  register int c1t1, c2t1, c1, c2, c3t1, c3;

#pragma scop
if (((ni >= 1) && (nj >= 1))) {
  lb1 = round((-1 + (1 / T1c1)));
  ub1 = round(((ni * (1 / T1c1)) + ((1 / T1c1) * -1)));
#pragma omp parallel for private(c2t1, c1, c2) firstprivate(lb1, ub1)
    for (c1t1 = lb1; c1t1 <= ub1; ++(c1t1)) {
        for (c2t1 = round((-1 + (1 / T1c2))); c2t1 <= round(((nj * (1 / T1c2)) + ((1 / T1c2) * -1))); ++(c2t1)) {
            for (c1 = max((c1t1 * T1c1), 0); c1 <= min(((c1t1 * T1c1) + (T1c1 + -1)), (ni + -1)); c1++) {
                for (c2 = max((c2t1 * T1c2), 0); c2 <= min(((c2t1 * T1c2) + (T1c2 + -1)), (nj + -1)); c2++) {
          C[c1][c2]*=beta;
        }
      }
    }
  }
    if ((nk >= 1)) {
    lb1 = round((-1 + (1 / T1c1)));
    ub1 = round(((ni * (1 / T1c1)) + ((1 / T1c1) * -1)));
#pragma omp parallel for private(c2t1, c3t1, c1, c2, c3) firstprivate(lb1, ub1)
        for (c1t1 = lb1; c1t1 <= ub1; ++(c1t1)) {
            for (c2t1 = round((-1 + (1 / T1c2))); c2t1 <= round(((nj * (1 / T1c2)) + ((1 / T1c2) * -1))); ++(c2t1)) {
                for (c3t1 = round((-1 + (1 / T1c3))); c3t1 <= round(((nk * (1 / T1c3)) + ((1 / T1c3) * -1))); ++(c3t1)) {
                    for (c1 = max((c1t1 * T1c1), 0); c1 <= min(((c1t1 * T1c1) + (T1c1 + -1)), (ni + -1)); c1++) {
                        for (c2 = max((c2t1 * T1c2), 0); c2 <= min(((c2t1 * T1c2) + (T1c2 + -1)), (nj + -1)); c2++) {
                            for (c3 = max((c3t1 * T1c3), 0); c3 <= min(((c3t1 * T1c3) + (T1c3 + -1)), (nk + -1)); c3++) {
                C[c1][c2]+=alpha*A[c1][c3]*B[c3][c2];
              }
            }
          }
        }
      }
    }
  }
}
#pragma endscop


$pocc --pluto --ptile-fts gemm.c

#include <math.h>
#ifdef ceild
# undef ceild
#endif
#ifdef floord
# undef floord
#endif
#ifdef max
# undef max
#endif
#ifdef min
# undef min
#endif
#define ceild(n,d)  ceil(((double)(n))/((double)(d)))
#define floord(n,d) floor(((double)(n))/((double)(d)))
#define max(x,y)    ((x) > (y)? (x) : (y))
#define min(x,y)    ((x) < (y)? (x) : (y))


  register int lbv, ubv, lb, ub, lb1, ub1, lb2, ub2;
  register int c1t1, c2t1, c1, c2, c3t1, c3;

#pragma scop
if (((ni >= 1) && (nj >= 1))) {
  lb1 = round((-1 + (1 / T1c1)));
  ub1 = round(((ni * (1 / T1c1)) + ((1 / T1c1) * -1)));
#pragma omp parallel for private(c2t1, c1, c2) firstprivate(lb1, ub1)
    for (c1t1 = lb1; c1t1 <= ub1; ++(c1t1)) {
        for (c2t1 = round((-1 + (1 / T1c2))); c2t1 <= round(((nj * (1 / T1c2)) + ((1 / T1c2) * -1))); ++(c2t1)) {
            if (((((c1t1 * T1c1) >= 0) && ((c1t1 * T1c1) + (T1c1 + -1)) <= (ni + -1)) && (((c2t1 * T1c2) >= 0) && ((c2t1 * T1c2) + (T1c2 + -1)) <= (nj + -1)))) {
                for (c1 = (c1t1 * T1c1); c1 <= ((c1t1 * T1c1) + (T1c1 + -1)); ++(c1)) {
                    for (c2 = (c2t1 * T1c2); c2 <= ((c2t1 * T1c2) + (T1c2 + -1)); ++(c2)) {
            C[c1][c2]*=beta;
          }
        }
      }
      else {
                for (c1 = max((c1t1 * T1c1), 0); c1 <= min(((c1t1 * T1c1) + (T1c1 + -1)), (ni + -1)); c1++) {
                    for (c2 = max((c2t1 * T1c2), 0); c2 <= min(((c2t1 * T1c2) + (T1c2 + -1)), (nj + -1)); c2++) {
            C[c1][c2]*=beta;
          }
        }
      }
    }
  }
    if ((nk >= 1)) {
    lb1 = round((-1 + (1 / T1c1)));
    ub1 = round(((ni * (1 / T1c1)) + ((1 / T1c1) * -1)));
#pragma omp parallel for private(c2t1, c3t1, c1, c2, c3) firstprivate(lb1, ub1)
        for (c1t1 = lb1; c1t1 <= ub1; ++(c1t1)) {
            for (c2t1 = round((-1 + (1 / T1c2))); c2t1 <= round(((nj * (1 / T1c2)) + ((1 / T1c2) * -1))); ++(c2t1)) {
                for (c3t1 = round((-1 + (1 / T1c3))); c3t1 <= round(((nk * (1 / T1c3)) + ((1 / T1c3) * -1))); ++(c3t1)) {
                    if ((((((c1t1 * T1c1) >= 0) && ((c1t1 * T1c1) + (T1c1 + -1)) <= (ni + -1)) && (((c2t1 * T1c2) >= 0) && ((c2t1 * T1c2) + (T1c2 + -1)) <= (nj + -1))) && (((c3t1 * T1c3) >= 0) && ((c3t1 * T1c3) + (T1c3 + -1)) <= (nk + -1)))) {
                        for (c1 = (c1t1 * T1c1); c1 <= ((c1t1 * T1c1) + (T1c1 + -1)); ++(c1)) {
                            for (c2 = (c2t1 * T1c2); c2 <= ((c2t1 * T1c2) + (T1c2 + -1)); ++(c2)) {
                                for (c3 = (c3t1 * T1c3); c3 <= ((c3t1 * T1c3) + (T1c3 + -1)); ++(c3)) {
                  C[c1][c2]+=alpha*A[c1][c3]*B[c3][c2];
                }
              }
            }
          }
          else {
                        for (c1 = max((c1t1 * T1c1), 0); c1 <= min(((c1t1 * T1c1) + (T1c1 + -1)), (ni + -1)); c1++) {
                            for (c2 = max((c2t1 * T1c2), 0); c2 <= min(((c2t1 * T1c2) + (T1c2 + -1)), (nj + -1)); c2++) {
                                for (c3 = max((c3t1 * T1c3), 0); c3 <= min(((c3t1 * T1c3) + (T1c3 + -1)), (nk + -1)); c3++) {
                  C[c1][c2]+=alpha*A[c1][c3]*B[c3][c2];
                }
              }
            }
          }
        }
      }
    }
  }
}
#pragma endscop