#ifndef __DMAGNETICFIELDSTEPPER_CU__ #define __DMAGNETICFIELDSTEPPER_CU__ #include #include using namespace std; #include "cutil_math.h" #include "SimpleException.h" #include "GPUMagneticFieldStepper.h" #include "wireLoc.cu" //extern Wire* wires_d; //Wire array device symbol __device__ __constant__ unsigned int N_STATE_DELTAS; __device__ __constant__ state_delta_t *STATE_DELTAS; //---------------- // GPUInitTracking //---------------- void GPUInitTracking(vector &mass_hypotheses) { /// This sets up the GPU to be used for tracking. It /// includes copying the B-field into the GPU device /// and setting up some info for multiple track swims. /// More specifically: /// Set up the complete set of state_delta_t vectors /// that will be used during tracking. Memory is /// allocated on the device to hold these for the life /// of the program. See GPU_SwimMultiple for details. cudaError_t err; CopyBfieldMapToGPU(); // Number of variations of the starting parameters to // swim. The 11 is from: // 1 for no variation // 5 for each parameter +delta // 5 for each parameter -delta unsigned int n_state_deltas = mass_hypotheses.size()*11; // Allocate host memory to hold state deltas int SDsize = n_state_deltas*sizeof(state_delta_t); state_delta_t *state_deltas_h = (state_delta_t*)malloc(SDsize); // Allocate device memory to hold state deltas state_delta_t *state_deltas_d; (state_delta_t*)cudaMalloc(&state_deltas_d, SDsize); // delta values from which state deltas are derived float dphi = 0.001; // 1 mrad float dx = 1.0E-2; // 100 microns in cm float dq_over_p = 1.0/12.0/10.0; // 1/10th of largest possible momentum float sin_dphi = sinf(dphi); float cos_dphi = cosf(dphi); // Fill in values of state deltas for each mass state_delta_t *sd = state_deltas_h; for(unsigned int i=0; iCpx*xdir + sd->Cpy*ydir + sd->Cpz*zdir; pos += sd->Cx*xdir + sd->Cy*ydir; S.q_over_p += sd->Cq_over_p; S.mass = sd->mass; // Get pointer to location to store results trajectory_t *traj = &trajectories[idx]; // Swim using the modified state GPU_Swim(S, traj); } } //---------------- // GPU_Swim //---------------- __device__ void GPU_Swim(state_t S, trajectory_t *traj) { /// Swim a particle from the given starting position and /// momentum to through the magnetic field, storing each /// step in the given trajectory_t structure. // Setup first step to be starting position int &Nsteps = traj->Nsteps; state_t *step = traj->steps; *step = S; Nsteps=1; // Loop until done for(; Nsteps < traj->max_steps; Nsteps++){ // Copy last state into this one as starting point step[1] = *step++; GPU_Step(*step); float r2 = step->pos.x*step->pos.x + step->pos.y*step->pos.y; bool done = false; done |= step->pos.z > 390.0; done |= step->pos.z < 17.0; done |= r2 > 3360.0; if(done)break; } } //---------------- // GPU_GetStepSize //---------------- __device__ float GPU_GetStepSize(state_t &S) { /// Return the maximum step size for a particle in /// the given state. /// For now, this just returns 1cm return 1.0; // cm } //---------------- // GPU_Step //---------------- __device__ void GPU_Step(state_t &S) { /// Perform a 4th order Runge-Kutta step to advance /// the particle represented by the given state. This /// makes multiple calls to the TestStep routine. The /// values in S are updated to reflect the new state. float3 &pos = S.pos; float3 &mom = S.mom; // Get step size based on material float h = GPU_GetStepSize(S); // cm // Do 4th order Runge-Kutta (inspired) float3 dpos1, dpos2, dpos3, dpos4; float3 mom1, mom2, mom3, mom4; float4 B1, B2, B3, B4; GPU_GetBfield(pos, &B1); GPU_TestStep(h, S, B1, &dpos1, &mom1, &B2, 0.5); // field at mid-point 1 GPU_TestStep(h, S, B2, &dpos2, &mom2, &B3, 0.5); // field at mid-point 2 GPU_TestStep(h, S, B3, &dpos3, &mom3, &B4, 1.0); // field at end-point GPU_TestStep(h, S, B4, &dpos4, &mom4); // (no field needed) float3 delta_pos = (float)one_sixth*(dpos1 + (float)2.0*dpos2 + (float)2.0*dpos3 + dpos4); float3 new_mom = (float)one_sixth*( mom1 + 2.0f* mom2 + 2.0f* mom3 + mom4); pos += delta_pos; mom = new_mom; // direction only // Adjust for energy loss (not yet implemented correctly!) float &q_over_p = S.q_over_p; float &mass = S.mass; float delta_E = -0.000001*h; // energy lost this step in GeV float one_over_p = fabs(q_over_p); float mass_over_p = mass * one_over_p; float delta_one_over_p = sqrt(1.0 + mass_over_p*mass_over_p); delta_one_over_p *= q_over_p*one_over_p * delta_E; // sign depends on charge delta_one_over_p *= (float)(q_over_p!=0.0); // keep value identically zero if it came in that way q_over_p -= delta_one_over_p; } //---------------- // GPU_TestStep //---------------- __device__ void GPU_TestStep(float h, state_t &Sstart, float4 &B, float3 *delta_pos, float3 *new_mom, float4 *Bmid, float h_frac) { /// Propagate the parameters given in Sstart forward /// a distance "h" along a helical trajectory determined /// by the field given in "B". The position and momentum /// change to the end of the step are returned in /// delta_pos and delta_mom. In addition, if Bmid is not /// NULL, then the field is calculated at a /// point on the helix at h_frac of the full step size. /// e.g. if h_frac=0, the field at the starting point /// is returned. If h_frac=0.5, the field at the half-way /// point along the helical step is returned. This is done since /// it is quicker to do this here when the natural /// coordinates are already calculated for the given field. float3 &pdir = Sstart.mom; float q_over_p = Sstart.q_over_p; float &Bmag = B.w; // Natural coordinate system of reference trajectory float3 &xdir = Sstart.xdir; float3 &ydir = Sstart.ydir; float3 &zdir = Sstart.zdir; zdir = make_float3(B); // discards w which is used to hold magnitude xdir = normalize(cross(pdir, zdir)); ydir = cross(zdir, xdir); // Theta is angle between momentum vector and B-field // direction. In natural coordinates, p is in y/z // plane. Use dot and cross products to get cosine // and sine respectively. float cos_theta = dot(pdir, zdir); // could be + or - float sin_theta = dot(pdir, ydir); // + definite // Rotation angle cooresponding to this helical step. // This is not immediately intuitive (at least to me) // but it is derived by considering: // // R * dphi = h * sin(theta) // // and // // qBr * q*Bmag*R = p_t // // solving the second equation for R in terms of // p*sin(theta) = p_t and inserting into the first // gives the below relation for dphi. float sin_theta_over_R = qBr2p * q_over_p * Bmag; // sign depends only on q // It's possible that the value of q_over_p passed in // is zero or that the magnitude of the field is zero. // In either case, sin_theta_over_R will then be zero // leading to problems below. Check here if this is the // case and if so, shift by a tiny amount to avoid dividing // by zero. (This essentially handles straight-line tracks.) float epsilon = 1.0E-8; sin_theta_over_R += (float)(sin_theta_over_R==0.0)*epsilon; // prevent sin_theta_over_R from being exactly zero float dphi = h * sin_theta_over_R; // should be small float sin_dphi = sinf(dphi); // sign depends only on q for small dphi float cos_dphi = cosf(dphi); // + definite for small dphi // Radius of curvature // The radius of curvature comes from qBr=pt where "pt" // is transverse momentum. float R = sin_theta/sin_theta_over_R; // + or - depending on q // Rotate momentum vector about natural z-axis float &pr = sin_theta; // + definite float px = +pr*sin_dphi; // + or - depending on q float py = +pr*cos_dphi; // should always be + float pz = cos_theta; // z-component doesn't change *new_mom = px*xdir + py*ydir + pz*zdir; // Rotate position about center of helix float dx = R*(1.0-cos_dphi); // + or - depending on q float dy = R*sin_dphi; // + definite (since both terms depend on q) float dz = h*cos_theta; // + or - depending on direction of p relative to B *delta_pos = dx*xdir + dy*ydir + dz*zdir; // Calculate field at fractional step (if specified) if(Bmid != NULL){ dphi *= h_frac; sin_dphi = sin(dphi); cos_dphi = cos(dphi); dx = R*(1.0-cos_dphi); dy = R*sin_dphi; dz *= h_frac; float3 pos = Sstart.pos + dx*xdir + dy*ydir + dz*zdir; GPU_GetBfield(pos, Bmid); } // In priciple, all threads should be in sync here, but in // case they aren't, re-sync them now. __syncthreads(); } //---------------- // GPU_SwimSingle //---------------- __global__ void GPU_SwimSingle(float q, float mass, float3 start_pos, float3 start_mom, trajectory_t *traj) { /// This is a host-callable kernel that will swim /// a single track. state_t S; float px = start_mom.x; float py = start_mom.x; float pz = start_mom.x; float p = sqrt(px*px + py*py + pz*pz); px /= p; py /= p; pz /= p; S.mom = make_float3(px, py, pz); S.pos = start_pos; S.q_over_p = q/p; S.mass = mass; GPU_Swim(S, traj); } //---------------- // MakeTrajectoryBlock //---------------- TrajectoryBlock* MakeTrajectoryBlock(unsigned int N) { /// Create a TrajectoryBlock structure, allocating /// both host and device memory to hold the /// given number of trajectory_t structures. Memory /// is also allocated on the device to hold all of /// the trajectory steps. The relevant pointers are /// all set in the structures so that the "d" field /// of the trajectory block can be passed into the /// relevant GPU tracking routines used for processing /// multiple tracks. cudaError_t err; // Allocate memory for the trajectory block structure itself TrajectoryBlock *tb = (TrajectoryBlock*)malloc(sizeof(TrajectoryBlock)); tb->N = N; // Allocate enough memory on the device to hold all steps // for all trajectories in the block unsigned int max_steps = 10000.0; unsigned int size_steps = max_steps*sizeof(state_t); state_t *steps; err = cudaMalloc(&steps, N*size_steps); if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err)); // Allocate memory on host to hold trajectory_t structures // and fill in fields tb->h = (trajectory_t*)malloc(N*sizeof(trajectory_t)); trajectory_t *traj = tb->h; for(unsigned int i=0; iNsteps = 0; traj->max_steps = max_steps; traj->steps = &steps[i*max_steps]; } // Allocate memory on device to hold trajectory_t structures // and copy values from host over. err = cudaMalloc(&tb->d, N*sizeof(trajectory_t)); if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err)); err = cudaMemcpy(tb->d, tb->h, N*sizeof(trajectory_t), cudaMemcpyHostToDevice); if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err)); // Create a dedicate cuda stream to use with this TrajectoryBlock. // This will allow multiple TrajectoryBlocks to be filled // simultaneously. err = cudaStreamCreate(&tb->stream); if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err)); // return pointer to TrajectoryBlock return tb; } //---------------- // FreeTrajectoryBlock //---------------- void FreeTrajectoryBlock(TrajectoryBlock *tb) { cudaError_t err; if(!tb)throw SimpleException(string("ERROR: trying to free NULL TrajectoryBlock pointer")); cudaStreamDestroy(tb->stream); err = cudaFree(tb->h->steps); if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err)); free(tb->h); err = cudaFree(tb->d); if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err)); free(tb); } //---------------- // Swim //---------------- void Swim(float q, float mass, float3 start_pos, float3 start_mom, trajectory_t *&traj) { /// This is primarily intended for testing. It is a host resident /// routine that will call the device swimmer, copying the entire /// trajectory from the device back to the host. If the given /// traj pointer reference has a value of NULL, then a trajectory /// structure will be allocated along with the memory to hold the /// steps. The caller may either destroy this, or recycle it on /// a subsequent call. // Allocate memory on host for trajectory if(traj == NULL){ traj = (trajectory_t*)malloc(sizeof(trajectory_t)); traj->Nsteps = 0; traj->max_steps = 0; traj->steps = NULL; } if(traj->steps==NULL){ traj->max_steps = 10; traj->steps = (state_t*)malloc(sizeof(state_t)*traj->max_steps); } // Allocate memory on device for trajectory trajectory_t *d_traj; state_t *d_steps; cudaMalloc(&d_traj, sizeof(trajectory_t)); cudaMalloc(&d_steps, sizeof(state_t)*traj->max_steps); // Make a copy of the host-resident trajectory_t structure so we can // replace the pointer to "steps" with the device resident steps. trajectory_t tmp = *traj; tmp.steps = d_steps; // Copy trajectory stucture to device memory cudaMemcpy(d_traj, &tmp, sizeof(trajectory_t), cudaMemcpyHostToDevice); // Call device swimmer GPU_SwimSingle<<<1,1>>>(q, mass, start_pos, start_mom, d_traj); // Copy trajectory stucture back from device memory cudaMemcpy(&tmp, d_traj, sizeof(trajectory_t), cudaMemcpyDeviceToHost); // Copy steps back from device memory traj->Nsteps = tmp.Nsteps; if(traj->Nsteps<1 || traj->Nsteps>traj->max_steps){ throw SimpleException("Bad number of steps reutrned from GPU_Swim"); } cudaMemcpy(traj->steps, d_steps, sizeof(state_t)*traj->Nsteps, cudaMemcpyDeviceToHost); } //returnClosestTrajectoryPoint __global__ void returnClosestTrajectoryPoint(trajectory_t *trajectories, int* wireIndexes_d, int* closestTrajectoryPoints_d, int NxM, int max) { int idx_i = blockIdx.x * blockDim.x + threadIdx.x; int idx_j = blockIdx.y * blockDim.y + threadIdx.y; int wireIndex = wireIndexes_d[idx_i + 1]; //Can't use zero trajectory_t trajectory = trajectories[idx_j]; float vals[2] = {99999}; int ptr[2]; //index of trajectory points for(int i = 0; i < trajectory.Nsteps; i++) { float x = wires_d[wireIndex].x - trajectory.steps[i].pos.x; float y = wires_d[wireIndex].y - trajectory.steps[i].pos.y; float z = wires_d[wireIndex].z - trajectory.steps[i].pos.z; float iN = wires_d[wireIndex].i; float jN = wires_d[wireIndex].j; float kN = wires_d[wireIndex].k; float dotP = x * iN + y * jN + z * kN; x = x - (dotP * iN); y = y - (dotP * jN); z = z - (dotP * kN); vals[1] = sqrt(x * x + y * y + z * z); ptr[1] = i; int idx = vals[0] > vals[1]; //1 or 0 based on min or not vals[0] = vals[idx]; //Set new min ptr[0] = ptr[idx]; } //min distance point should be in ptr[0] int finalIndex = idx_j + idx_i * NxM; //row major order closestTrajectoryPoints_d[finalIndex] = ptr[0]; //printf("Data: %d", ptr[0]); //printf("\nThe data was copied to index: %d", finalIndex); } //sumWireResiduals __global__ void sumWireResiduals(int NxM, float *closestApproach_d, float *wireResiduals_d, int max) { int idx_i = blockIdx.x * blockDim.x + threadIdx.x; int idx_j = blockIdx.y * blockDim.y + threadIdx.y; //float wireSum = 0; //for(int i = 0; i < M; i++) //loop through trajetory points //{ // wireSum += closestApproach_d[threadIdx.x + 1 + i]; //Can't use zero //} if(idx_j + idx_i * NxM <= max) wireResiduals_d[idx_i] += closestApproach_d[idx_j + idx_i * NxM]; } #endif // __DMAGNETICFIELDSTEPPER_CU__