CS255
Chris Pollett
Feb 18, 2015
Let A =[0, 0] be a global array of length 2
A = [0,0] SpawnSync(true, 0) SpawnSync(parent, location): 1 if(parent): spawn SpawnSync(false, location + 1) 2 A[location] = location + 1 3 sync 4 if(parent): for i =0 to A.length - 1: print A[i]
The above code should output:
1 2
import java.lang.Thread;

/**
 * Java rendering of the SpawnSync pseudocode using raw threads.
 *
 * The parent thread (the one constructed with a null spawner) spawns one
 * child for the next array slot, writes its own slot of the shared array,
 * waits for the child to finish ("sync"), and then prints the array.
 * For the two-element array below the expected output is 1 then 2.
 */
public class SpawnSyncDemo extends Thread
{
    /**
     * @param spawner the thread that spawned this one, or null for the
     *      top-level (parent) thread
     * @param location index of the slot of {@link #a} this thread writes
     */
    public SpawnSyncDemo(SpawnSyncDemo spawner, int location)
    {
        this.location = location;
        this.spawner = spawner;
        done = false;
    }

    /**
     * Body of one strand: optionally spawn a child, write a[location],
     * announce completion, then (parent only) sync with the child and
     * print the array.
     */
    public void run()
    {
        SpawnSyncDemo child = null;
        setDone(false);
        try {
            if (spawner == null) {
                child = new SpawnSyncDemo(this, location + 1);
                child.start();
            }
            a[location] = location + 1;
        } finally {
            // Always publish completion, even if the write above threw, so
            // that a thread blocked in sync() on this object cannot hang.
            setDone(true);
        }
        if (child != null) {
            child.sync();
        }
        if (spawner == null) {
            for (int i = 0; i < a.length; i++) {
                System.out.println(a[i]);
            }
        }
    }

    /**
     * Blocks the caller until this thread has finished its own work.
     *
     * The parent calls this on its child. The wait is wrapped in a loop
     * because wait() may wake spuriously; the wake-up that matters is the
     * notifyAll() issued by setDone(true). If the caller is interrupted
     * while waiting, the stack trace is printed, the interrupt flag is
     * restored, and the method returns without the child necessarily
     * being done.
     */
    public synchronized void sync()
    {
        while (!done) {
            try {
                wait();
            } catch (InterruptedException ie) {
                ie.printStackTrace();
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Updates the completion flag under this object's monitor and, when the
     * flag becomes true, wakes every thread waiting in sync(). Doing both
     * under the same lock that sync() holds makes the write to the shared
     * array visible to the waiter.
     */
    private synchronized void setDone(boolean value)
    {
        done = value;
        if (value) {
            notifyAll();
        }
    }

    public static void main(String args[])
    {
        SpawnSyncDemo parent = new SpawnSyncDemo(null, 0);
        parent.start();
    }

    /** Index of the slot of a[] written by this thread. */
    int location;
    /** Thread that spawned this one; null marks the parent. */
    SpawnSyncDemo spawner;
    /** True once this thread has written its slot; guarded by this. */
    boolean done;
    /** Shared global array A from the pseudocode. */
    static int a[] = {0, 0};
}
import java.lang.Thread;

/**
 * Divide-and-conquer inner product of the static arrays a and b using one
 * raw thread per subrange.
 *
 * A thread responsible for indices low..high either multiplies a single
 * pair (low == high) or spawns two threads for the two halves, syncs with
 * both, and adds their partial results. The top-level thread (null
 * spawner) prints the final value.
 */
public class InnerProductDemo extends Thread
{
    /**
     * @param spawner the thread that spawned this one, or null for the
     *      top-level thread
     * @param low first index (inclusive) of the range handled here
     * @param high last index (inclusive) of the range handled here
     */
    public InnerProductDemo(InnerProductDemo spawner, int low, int high)
    {
        this.high = high;
        this.low = low;
        this.spawner = spawner;
        done = false;
        c = 0;
    }

    /**
     * Computes the partial inner product for low..high into c, then marks
     * this thread done. The top-level thread also prints c.
     */
    public void run()
    {
        setDone(false);
        try {
            if (low > high) {
                // Empty range (e.g. empty input arrays): the sum of no
                // terms is 0. Without this guard the range would be split
                // forever.
                c = 0;
            } else if (low == high) {
                c = a[low] * b[low];
            } else {
                // Overflow-safe midpoint.
                int mid = low + (high - low) / 2;
                InnerProductDemo firstHalf =
                    new InnerProductDemo(this, low, mid);
                firstHalf.start();
                InnerProductDemo secondHalf =
                    new InnerProductDemo(this, mid + 1, high);
                secondHalf.start();
                firstHalf.sync();
                secondHalf.sync();
                c = firstHalf.c + secondHalf.c;
            }
        } finally {
            // Publish completion even on failure so a parent blocked in
            // sync() on this object cannot hang.
            setDone(true);
        }
        if (spawner == null) {
            System.out.println(c);
        }
    }

    /**
     * Blocks the caller until this thread has stored its result in c.
     *
     * The wait is wrapped in a loop because wait() may wake spuriously;
     * the wake-up that matters is the notifyAll() issued by setDone(true).
     * If the caller is interrupted while waiting, the stack trace is
     * printed, the interrupt flag is restored, and the method returns
     * without c necessarily being final.
     */
    public synchronized void sync()
    {
        while (!done) {
            try {
                wait();
            } catch (InterruptedException ie) {
                ie.printStackTrace();
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Updates the completion flag under this object's monitor and, when the
     * flag becomes true, wakes every thread waiting in sync(). Using the
     * same lock as sync() makes the preceding write to c visible to the
     * waiter.
     */
    private synchronized void setDone(boolean value)
    {
        done = value;
        if (value) {
            notifyAll();
        }
    }

    public static void main(String args[])
    {
        InnerProductDemo parent =
            new InnerProductDemo(null, 0, a.length - 1);
        parent.start();
    }

    /** First index (inclusive) of this thread's range. */
    int low;
    /** Last index (inclusive) of this thread's range. */
    int high;
    /** Partial inner product over low..high; valid once done is true. */
    int c = 0;
    /** Thread that spawned this one; null marks the top-level thread. */
    InnerProductDemo spawner;
    /** True once c has been computed; guarded by this. */
    boolean done;
    static int a[] = {1, 1, 1, 1, 1};
    static int b[] = {1, 1, 1, 1, 1};
}
In the code below, the program that runs on the GPU is stored in the Java String programSource. OpenCL programs are essentially C programs with a few restrictions and some additional keywords and special functions. For example, __global says that the variable is global to all processors, and get_global_id(0) gets the id of the current processor. To use the GPU code we need to compile it and specify bindings between variables in GPU land and those in CPU land. Then we can execute it on inputs from the CPU and read the computed output back from the GPU to the CPU.
/* * JOCL - Java bindings for OpenCL * * Copyright 2009 Marco Hutter - http://www.jocl.org/ */ import static org.jocl.CL.*; import org.jocl.*; /** * A small JOCL sample. */ public class JOCLSample { /** * The source code of the OpenCL program to execute */ private static String programSource = "__kernel void "+ "sampleKernel(__global const float *a,"+ " __global const float *b,"+ " __global float *c)"+ "{"+ " int gid = get_global_id(0);"+ " c[gid] = a[gid] * b[gid];"+ "}"; /** * The entry point of this sample * * @param args Not used */ public static void main(String args[]) { // Create input- and output data int n = 10; float srcArrayA[] = new float[n]; float srcArrayB[] = new float[n]; float dstArray[] = new float[n]; for (int i=0; i<n; i++) { srcArrayA[i] = i; srcArrayB[i] = i; } Pointer srcA = Pointer.to(srcArrayA); Pointer srcB = Pointer.to(srcArrayB); Pointer dst = Pointer.to(dstArray); // The platform, device type and device number // that will be used final int platformIndex = 0; final long deviceType = CL_DEVICE_TYPE_ALL; final int deviceIndex = 0; // Enable exceptions and subsequently omit error checks in this sample CL.setExceptionsEnabled(true); // Obtain the number of platforms int numPlatformsArray[] = new int[1]; clGetPlatformIDs(0, null, numPlatformsArray); int numPlatforms = numPlatformsArray[0]; // Obtain a platform ID cl_platform_id platforms[] = new cl_platform_id[numPlatforms]; clGetPlatformIDs(platforms.length, platforms, null); cl_platform_id platform = platforms[platformIndex]; // Initialize the context properties cl_context_properties contextProperties = new cl_context_properties(); contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform); // Obtain the number of devices for the platform int numDevicesArray[] = new int[1]; clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray); int numDevices = numDevicesArray[0]; // Obtain a device ID cl_device_id devices[] = new cl_device_id[numDevices]; clGetDeviceIDs(platform, deviceType, 
numDevices, devices, null); cl_device_id device = devices[deviceIndex]; // Create a context for the selected device cl_context context = clCreateContext( contextProperties, 1, new cl_device_id[]{device}, null, null, null); // Create a command-queue for the selected device cl_command_queue commandQueue = clCreateCommandQueue(context, device, 0, null); // Allocate the memory objects for the input- and output data cl_mem memObjects[] = new cl_mem[3]; memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * n, srcA, null); memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * n, srcB, null); memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, Sizeof.cl_float * n, null, null); // Create the program from the source code cl_program program = clCreateProgramWithSource(context, 1, new String[]{ programSource }, null, null); // Build the program clBuildProgram(program, 0, null, null, null, null); // Create the kernel cl_kernel kernel = clCreateKernel(program, "sampleKernel", null); // Set the arguments for the kernel clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(memObjects[0])); clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(memObjects[1])); clSetKernelArg(kernel, 2, Sizeof.cl_mem, Pointer.to(memObjects[2])); // Set the work-item dimensions long global_work_size[] = new long[]{n}; long local_work_size[] = new long[]{1}; // Execute the kernel clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, global_work_size, local_work_size, 0, null, null); // Read the output data clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0, n * Sizeof.cl_float, dst, 0, null, null); // Release kernel, program, and memory objects clReleaseMemObject(memObjects[0]); clReleaseMemObject(memObjects[1]); clReleaseMemObject(memObjects[2]); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(commandQueue); clReleaseContext(context); // Verify the result boolean passed = 
true; final float epsilon = 1e-7f; for (int i=0; i<n; i++) { float x = dstArray[i]; float y = srcArrayA[i] * srcArrayB[i]; boolean epsilonEqual = Math.abs(x - y) <= epsilon * Math.abs(x); if (!epsilonEqual) { passed = false; break; } } System.out.println("Test "+(passed?"PASSED":"FAILED")); if (n <= 10) { System.out.println("Result: "+java.util.Arrays.toString(dstArray)); } } }
A computation in the PRAM model proceeds as a series of parallel steps:
We want to define complexity classes which characterize those problems which can be executed more efficiently if we have more processors.
Recall an alphabet, `Sigma`, is any fixed finite set. Let `Sigma^star` denote the set of strings over the alphabet `Sigma`. We define a language, `L`, over `Sigma` to be any `L subseteq Sigma^star`. The complement of a language `L`, written `bar L`, consists of those strings `w` that are in `Sigma^star` but not in `L`.
Definition:
The class (uniform) NC consists of languages `L` that have a PRAM algorithm `A` such that for any `x in Sigma^star`,
The class RNC (where processors are allowed to use random coins) is defined the same way except we modify the first two conditions to:
The class ZNC is RNC`cap`co-RNC where co-RNC consists of those languages whose complement is in RNC.
For each of these classes we can define an analogous function/algorithm class. For example, `FNC` is the class of functions `f: NN -> NN` which on inputs of length `n` produce an output of length bounded by `p(n)`, where `p` is a polynomial, such that the language determining the `i`th bit of the output of `f` is in `NC`. We sometimes abuse notation and write NC (resp. RNC, ZNC) for both NC and FNC (resp. RNC and FRNC, ZNC and FZNC).