#include #include #include #include #include #include #include #include #include #include using namespace std; #include "GPUMagneticFieldStepper.cu" #include "distToRT.cu" #include "wireLoc.cu" #include "DTrajectory.cu" #include "DTrajectoryPool.cu" #define trajectoryCount 3*11 //---------------- // TestGPUSwimmer //---------------- void TestGPUSwimmer(void) { // For each candidate we want to swim 11 tracks, one for the candidate // parameters then 2 for each element of the state vector corresponding // to +delta and -delta for that element. We use the same deltas each // time and the same mass hypotheses so copy those into device constant // memory once and allow the individual threads to calculate the starting // parameters etc. from them. The only things needed to be copied to // device memory on an event by event basis are the starting position, // momentum, and q of the candidate. // // Each candidate should have its kernel launched separately first, // then the results read. This can allow kernels for multiple candidates // to run simultaneously. // Create list of mass hypotheses and setup GPU for tracking vector mass_hypotheses; mass_hypotheses.push_back(0.13965); mass_hypotheses.push_back(0.542); mass_hypotheses.push_back(0.938); GPUInitTracking(mass_hypotheses); // Create a set of track candidates vector candidates; candidates.push_back( MakeStateThetaPhi( 5.0/57.3, 0.0, 1.0, 0.0, 0.0, 65.0, -1.0) ); candidates.push_back( MakeStateThetaPhi(10.0/57.3, 10.0, 1.0, 0.0, 0.0, 65.0, +1.0) ); candidates.push_back( MakeStateThetaPhi(15.0/57.3, 150.0, 1.0, 0.0, 0.0, 65.0, -1.0) ); candidates.push_back( MakeStateThetaPhi(55.0/57.3, 250.0, 0.5, 0.0, 0.0, 65.0, +1.0) ); // Allocate TrajectoryBlocks cout<<"Allocating TrajectoryBlocks..."< trajectory_blocks; for(unsigned int i=0; iN,0,tb->stream>>>(candidates[i], tb->d); } cudaDeviceSynchronize(); // synchronize all streams // Test code for wires for(int i = 0; i < candidates.size(); i++) { TrajectoryBlock *tb = trajectory_blocks[i]; // Get TrajectoryBlock to use for this candidate int HITCOUNT = 24; // TEMPORARY (should come from file!) int* wireIndexes_d; //Device array of wires indexes for wires hit int wireIndexes[HITCOUNT+1]; //First element will store number of (wire) hits wireIndexes[0] = HITCOUNT; for(int j = 1; j < HITCOUNT+1; j++) { wireIndexes[j] = j+10; // TEMPORARY!!!! } //int block_size = 256; //Recommended for block size, may need changed later //int n_blocks = N/block_size + (N%block_size == 0 ? 0:1); //For non-full blocks cudaMalloc((void **) &wireIndexes_d, sizeof(int) * (HITCOUNT+1)); cudaMemcpy(wireIndexes, wireIndexes_d, sizeof(int) * (HITCOUNT+1), cudaMemcpyHostToDevice); // cudaError_t error = cudaMemcpyToSymbol(wireIndexes_d_symbol, &wireIndexes_d, sizeof(int*), size_t(0),cudaMemcpyHostToDevice); int N = HITCOUNT; // number of wires hit int M = tb->N; // number of trajectories int NxM = N * M; int max = N * NxM + M; // max index possible so that extra threads are ignored in the last incomplete block int num_elements_x = N; int num_elements_y = M; dim3 block_size; //threads per block block_size.x = 256; //new 2-d system block_size.y = 256; dim3 grid_size; //block count grid_size.x = num_elements_x / block_size.x + (num_elements_x % block_size.x == 0 ? 0:1); //for extra threads to handle uneven blocks. Need to add if-statements to catch extra threads! grid_size.y = num_elements_y / block_size.y + (num_elements_y % block_size.y == 0 ? 0:1); int* closestTrajectoryPoints_d; cudaMalloc((void **) &closestTrajectoryPoints_d, sizeof(int) * M * N); returnClosestTrajectoryPoint <<< grid_size, block_size >>>(tb->d, wireIndexes_d, closestTrajectoryPoints_d, NxM, max); //DEBUG cout << "Printing out closest wire indices for each trajectory point" << endl; int closestTrajectoryPoints[M*N]; cudaMemcpy(closestTrajectoryPoints_d, closestTrajectoryPoints, sizeof(int) * M * N, cudaMemcpyDeviceToHost); for(int i = 0; i < M*N; i++) { cout << closestTrajectoryPoints[i] << endl; } //returning negative values, there must be an error above here! Using fabs() on the distance didn't fix it, so not sure if it's needed. //DEBUG float* closestApproach_d; //Closest Trajectory Approach to Wire for each trajectory - wire combo cudaMalloc((void **) &closestApproach_d, sizeof(float) * M * N); distToRT <<< grid_size, block_size >>>(tb->d, closestTrajectoryPoints_d, wireIndexes_d, closestApproach_d, NULL, NxM, max); //DEBUG cout << "Printing out closest approach values for each wire/trajectory point combo" << endl; int closestApproach[M*N]; cudaMemcpy(closestApproach_d, closestApproach, sizeof(int) * M * N, cudaMemcpyDeviceToHost); for(int i = 0; i < M*N; i++) { cout << closestApproach[i] << endl; } //DEBUG float* wireResiduals_d; //Sum distances from all the trajectories to each wire cudaMalloc((void **) &wireResiduals_d, sizeof(float) * N); //in matrix, sum all the wire residuals sumWireResiduals <<< grid_size, block_size >>>(NxM, closestApproach_d, wireResiduals_d, max); } // Free TrajectoryBlocks for(unsigned int i=0; i> length >> pos_x >> pos_y >> pos_z >> dir_x >> dir_y >> dir_z; a_h[wireCount].id = wireCount; a_h[wireCount].L = length; a_h[wireCount].x = pos_x; a_h[wireCount].y = pos_y; a_h[wireCount].z = pos_z; a_h[wireCount].i = dir_x; a_h[wireCount].j = dir_y; a_h[wireCount].k = dir_z; wireCount++; } } file.close(); // Initialize wire arrays /* int index; for(int i = 0; i < 24; i++) //Planes { if(i % 2 == 0) //Check if even { for (int j = 0; j < 100; j++) //Wires vertical { index = j + (i * 100); a_h[index].id = j + i*100; //Measurements in cm a_h[index].x = j + 1; a_h[index].y = 0; a_h[index].z = i * 5; a_h[index].i = 0; a_h[index].j = 1; a_h[index].k = 0; a_h[index].L = 1; //a_h[index].ptr = NULL; } } else { for (int j = 0; j < 100; j++) //Wires horizontal { index = j + (i * 100); a_h[index].id = j + i*100; //Measurements in cm a_h[index].x = 0; a_h[index].y = j + 1; a_h[index].z = i * 5; a_h[index].i = 1; a_h[index].j = 0; a_h[index].k = 0; a_h[index].L = 1; //a_h[index].ptr = NULL; } } } */ cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice); //Copy to device cudaError_t error = cudaMemcpyToSymbol(a_dd, &a_d, sizeof(Wire*), size_t(0),cudaMemcpyHostToDevice); //printf("CUDA Error: %s\n", cudaGetErrorString(error)); int threadCount = 1; int block_size = 4; int n_blocks = threadCount/block_size + (threadCount%block_size == 0 ? 0:1); returnWire <<< n_blocks, block_size >>> (tempD, 3); //ID of 3 for testing cudaMemcpy(tempH, tempD, sizeof(Wire), cudaMemcpyDeviceToHost); printWire(tempH); returnWire <<< n_blocks, block_size >>> (tempD, 4); //ID of 4 for testing cudaMemcpy(tempH, tempD, sizeof(Wire), cudaMemcpyDeviceToHost); printWire(tempH); returnWire <<< n_blocks, block_size >>> (tempD, 5); //ID of 5 for testing cudaMemcpy(tempH, tempD, sizeof(Wire), cudaMemcpyDeviceToHost); printWire(tempH); returnWire <<< n_blocks, block_size >>> (tempD, 500); //ID of 500 for testing cudaMemcpy(tempH, tempD, sizeof(Wire), cudaMemcpyDeviceToHost); printWire(tempH); returnWire <<< n_blocks, block_size >>> (tempD, 1043); //ID of 1043 for testing cudaMemcpy(tempH, tempD, sizeof(Wire), cudaMemcpyDeviceToHost); printWire(tempH); //printf("0x%x\n", a_d); // Cleanup free(a_h); free(tempH); cudaFree(a_d); //DELETE AFTER? cudaFree(tempD); } //---------------- // main //---------------- int main(int narg, char *argv[]) { InitCUDA(); //DTrajectoryPool *pool = new DTrajectoryPool(1); //Needs arg to avoid error TestWireLoc(); TestGPUSwimmer(); return 0; }