#ifndef __DMAGNETICFIELDSTEPPER_CU__
#define __DMAGNETICFIELDSTEPPER_CU__

#include <iostream>
#include <vector>
using namespace std;

#include "cutil_math.h"
#include "SimpleException.h"
#include "GPUMagneticFieldStepper.h"

#define MAX_WIRES_HIT 128
#define MAX_STATE_DELTAS 64

__device__ __constant__ unsigned int N_MASS_HYPOTHESES;
__device__ __constant__ unsigned int N_TRAJECTORIES_PER_HYPOTHESIS;
__device__ __constant__ unsigned int N_STATE_DELTAS;
__device__ __constant__ float STATE_DS[MAX_STATE_DELTAS];
__device__ __constant__ state_delta_t STATE_DELTAS[MAX_STATE_DELTAS];

unsigned int n_mass_hypotheses = 0;
unsigned int n_trajectories_per_hypothesis = 11;
unsigned int n_state_deltas = 0;      // Copy of the number of state deltas in host memory
float *state_ds;                      // holds change in each state parameter for easy calculation of numerical derivative
state_delta_t *state_deltas_h = NULL; // Copy of state deltas in host memory

//----------------
// GPUInitTracking
//----------------
void GPUInitTracking(vector<float> &mass_hypotheses)
{
    /// This sets up the GPU to be used for tracking. It
    /// includes copying the B-field into the GPU device
    /// and setting up some info for multiple track swims.
    /// More specifically:
    ///
    /// Set up the complete set of state_delta_t vectors
    /// that will be used during tracking. Memory is
    /// allocated on the device to hold these for the life
    /// of the program. See GPU_SwimMultiple for details.

    cudaError_t err;

    CopyBfieldMapToGPU();

    // Number of variations of the starting parameters to
    // swim. The 11 is from:
    //    1 for no variation
    //    5 for each parameter +delta
    //    5 for each parameter -delta
    n_mass_hypotheses = mass_hypotheses.size();
    n_state_deltas = n_mass_hypotheses*n_trajectories_per_hypothesis;

    // Make sure there is enough memory allocated to hold all
    // of the mass hypotheses requested.
    if(n_state_deltas > MAX_STATE_DELTAS){
        cerr << "Too many state variations: " << n_state_deltas << " (max. " << MAX_STATE_DELTAS << ")" << endl;
        cerr << "This is probably because too many mass hypotheses were specified." << endl;
        cerr << "Try changing the value of MAX_STATE_DELTAS in GPUMagneticFieldStepper.cu" << endl;

// ... (the remainder of GPUInitTracking and the opening of the multi-track
//      swim kernel are missing from this copy; the kernel body resumes below) ...

        mom += sd->Cpx*xdir + sd->Cpy*ydir + sd->Cpz*zdir;
        pos += sd->Cx*xdir + sd->Cy*ydir;
        S.q_over_p += sd->Cq_over_p;
        S.mass = sd->mass;

        // Get pointer to location to store results
        trajectory_t *traj = &trajectories[idx];

        // Swim using the modified state
        GPU_Swim(S, traj);
    }
}
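/*
   Hedged sketch (NOT the original code): the tail of GPUInitTracking above is
   missing, but the __constant__ symbols declared at the top of this file must
   be published from the host before any swim kernel runs. A minimal version of
   that upload, assuming the host arrays state_ds and state_deltas_h have
   already been allocated and filled, could look like:

    cudaMemcpyToSymbol(N_MASS_HYPOTHESES,             &n_mass_hypotheses,             sizeof(unsigned int));
    cudaMemcpyToSymbol(N_TRAJECTORIES_PER_HYPOTHESIS, &n_trajectories_per_hypothesis, sizeof(unsigned int));
    cudaMemcpyToSymbol(N_STATE_DELTAS,                &n_state_deltas,                sizeof(unsigned int));
    cudaMemcpyToSymbol(STATE_DS,     state_ds,        n_state_deltas*sizeof(float));
    cudaMemcpyToSymbol(STATE_DELTAS, state_deltas_h,  n_state_deltas*sizeof(state_delta_t));
*/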
//----------------
// GPU_Swim
//----------------
__device__ void GPU_Swim(state_t &S, trajectory_t *traj)
{
    /// Swim a particle from the given starting position and
    /// momentum through the magnetic field, storing each
    /// step in the given trajectory_t structure.

    // Setup first step to be starting position
    int &Nsteps = traj->Nsteps;
    step_t *step = traj->steps;
    *(state_t*)step = S;
    step->s = 0.0; // initialize to zero distance along trajectory
    Nsteps = 1;

    // Use local variables so they go into shared(?) memory
    step_t mystep;
    float3 *pos = &mystep.pos;
    float3 *mom = &mystep.mom;

    // Copy starting parameters into local variables
    *pos = S.pos;
    *mom = S.mom;
    mystep.q_over_p = S.q_over_p;
    mystep.mass = S.mass;

    // Loop until done
    for(; Nsteps < traj->max_steps; Nsteps++){

        // Update pointers to previous and current step
        step_t *step_prev = step++;

        // Take step. Pass pointer to previous step so the
        // xdir, ydir, zdir, Ro values can be filled in.
        GPU_Step(mystep, *step_prev);
        step->pos = *pos;
        step->mom = *mom;
        step->q_over_p = mystep.q_over_p;

        float r2 = pos->x*pos->x + pos->y*pos->y;

        bool done = false;
        done |= step->pos.z > 390.0;
        done |= step->pos.z < 17.0;
        done |= r2 > 3360.0;
        done |= Nsteps > 350;
        if(done) break;
    }
}

//----------------
// GPU_GetStepSize
//----------------
__device__ float GPU_GetStepSize(step_t &S)
{
    /// Return the maximum step size for a particle in
    /// the given state.
    ///
    /// For now, this just returns 1 cm.
    return 1.0; // cm
}

//----------------
// GPU_Step
//----------------
__device__ void GPU_Step(step_t &S, step_t &Sprev)
{
    /// Perform a 4th order Runge-Kutta step to advance
    /// the particle represented by the given state. This
    /// makes multiple calls to the GPU_TestStep routine. The
    /// values in S are updated to reflect the new state.

    float3 &pos = S.pos;
    float3 &mom = S.mom; // already normalized

    // Get step size based on material
    //float h = GPU_GetStepSize(S); // cm
    float h = 1; // cm

    // Euler's Method
    float3 dpos1;
    float3 mom1;
    float4 B1;
    GPU_GetBfield(pos, &B1);
    GPU_TestStep(h, S, B1, &dpos1, &mom1, NULL, 1, &Sprev);
    float3 delta_pos = dpos1;
    float3 new_mom = mom1;

/*
    // Do 4th order Runge-Kutta (inspired)
    float3 dpos1, dpos2, dpos3, dpos4;
    float3 mom1, mom2, mom3, mom4;
    float4 B1, B2, B3, B4;
    GPU_GetBfield(pos, &B1);
    GPU_TestStep(h, S, B1, &dpos1, &mom1, &B2, 0.5, &Sprev); // field at mid-point 1
    GPU_TestStep(h, S, B2, &dpos2, &mom2, &B3, 0.5);         // field at mid-point 2
    GPU_TestStep(h, S, B3, &dpos3, &mom3, &B4, 1.0);         // field at end-point
    GPU_TestStep(h, S, B4, &dpos4, &mom4);                   // (no return field needed)

    float3 delta_pos = (float)one_sixth*(dpos1 + (float)2.0*dpos2 + (float)2.0*dpos3 + dpos4);
    float3 new_mom   = (float)one_sixth*( mom1 + 2.0f* mom2 + 2.0f* mom3 +  mom4);
*/

    pos += delta_pos;
    mom = new_mom; // direction only

    // Adjust for energy loss (not yet implemented correctly!)
    float &q_over_p = S.q_over_p;
    float &mass = S.mass;
    float delta_E = -0.000001*h; // energy lost this step in GeV
    float one_over_p = fabs(q_over_p);
    float mass_over_p = mass * one_over_p;
    float delta_one_over_p = sqrt(1.0 + mass_over_p*mass_over_p);
    delta_one_over_p *= q_over_p*one_over_p * delta_E; // sign depends on charge
    delta_one_over_p *= (float)(q_over_p != 0.0);      // keep value identically zero if it came in that way
    //q_over_p -= delta_one_over_p;

    // Update distance along path
    S.s = Sprev.s + h;
}
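// A short derivation of the (currently disabled) energy-loss update in
// GPU_Step above, added for clarity. It follows directly from E^2 = p^2 + m^2:
//
//     E dE = p dp          =>   dp = (E/p) dE
//     d(1/p) = -dp/p^2      =   -(E/p) dE / p^2  =  -sqrt(1 + (m/p)^2) dE / p^2
//     d(q/p) = q d(1/p)     =   -(q/p)(1/p) sqrt(1 + (m/p)^2) dE
//
// delta_one_over_p computed in GPU_Step is the negative of this quantity
// (delta_E < 0 for a loss), which is why the commented-out update subtracts
// it from q_over_p.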
//----------------
// GPU_TestStep
//----------------
__device__ void GPU_TestStep(float h, step_t &Sstart, float4 &B, float3 *delta_pos, float3 *new_mom, float4 *Bmid, float h_frac, step_t *Sprev)
{
    /// Propagate the parameters given in Sstart forward
    /// a distance "h" along a helical trajectory determined
    /// by the field given in "B". The position and momentum
    /// change to the end of the step are returned in
    /// delta_pos and new_mom. In addition, if Bmid is not
    /// NULL, then the field is calculated at a point on the
    /// helix at h_frac of the full step size, e.g. if
    /// h_frac=0, the field at the starting point is returned;
    /// if h_frac=0.5, the field at the half-way point along
    /// the helical step is returned. This is done here since
    /// it is quicker when the natural coordinates are already
    /// calculated for the given field.

    float3 &pdir = Sstart.mom;
    float q_over_p = Sstart.q_over_p;
    float &Bmag = B.w;

    // Natural coordinate system of reference trajectory
    float3 &xdir = Sstart.xdir;
    float3 &ydir = Sstart.ydir;
    float3 &zdir = Sstart.zdir;
    zdir = make_float3(B); // discards w, which is used to hold the field magnitude
    xdir = normalize(cross(pdir, zdir));
    ydir = cross(zdir, xdir);

    // Theta is the angle between the momentum vector and the
    // B-field direction. In natural coordinates, p lies in the
    // y/z plane. Use dot products to get the cosine and sine.
    float cos_theta = dot(pdir, zdir); // could be + or -
    float sin_theta = dot(pdir, ydir); // + definite

    // Rotation angle corresponding to this helical step.
    // This is not immediately intuitive (at least to me)
    // but it is derived by considering:
    //
    //    R * dphi = h * sin(theta)
    //
    // and
    //
    //    qBr2p * q*Bmag*R = p_t
    //
    // Solving the second equation for R in terms of
    // p*sin(theta) = p_t and inserting into the first
    // gives the relation for dphi below.
    float sin_theta_over_R = qBr2p * q_over_p * Bmag; // sign depends only on q

    // It's possible that the value of q_over_p passed in
    // is zero or that the magnitude of the field is zero.
    // In either case, sin_theta_over_R will then be zero,
    // leading to problems below. Check here if this is the
    // case and if so, shift by a tiny amount to avoid dividing
    // by zero. (This essentially handles straight-line tracks.)
    float epsilon = 1.0E-8;
    sin_theta_over_R += (float)(sin_theta_over_R == 0.0)*epsilon; // prevent sin_theta_over_R from being exactly zero

    float dphi = h * sin_theta_over_R; // should be small
    float sin_dphi = sinf(dphi);       // sign depends only on q for small dphi
    float cos_dphi = cosf(dphi);       // + definite for small dphi

    // Radius of curvature.
    // The radius of curvature comes from qBr=pt where "pt"
    // is the transverse momentum.
    float R = sin_theta/sin_theta_over_R; // + or - depending on q

    // Rotate momentum vector about natural z-axis
    float &pr = sin_theta;   // + definite
    float px = +pr*sin_dphi; // + or - depending on q
    float py = +pr*cos_dphi; // should always be +
    float pz = cos_theta;    // z-component doesn't change
    *new_mom = px*xdir + py*ydir + pz*zdir;

    // Rotate position about center of helix
    float dx = R*(1.0-cos_dphi); // + or - depending on q
    float dy = R*sin_dphi;       // + definite (since both terms depend on q)
    float dz = h*cos_theta;      // + or - depending on direction of p relative to B
    *delta_pos = dx*xdir + dy*ydir + dz*zdir;

    // Calculate field at fractional step (if specified)
    if(Bmid != NULL){
        dphi *= h_frac;
        sin_dphi = sin(dphi);
        cos_dphi = cos(dphi);
        dx = R*(1.0-cos_dphi);
        dy = R*sin_dphi;
        dz *= h_frac;
        float3 pos = Sstart.pos + dx*xdir + dy*ydir + dz*zdir;
        GPU_GetBfield(pos, Bmid);
    }

    // The Sprev variable optionally points to a step_t in which
    // to save the direction vectors that make up the natural
    // coordinate system. This information is stored for the
    // *previous* step because the B-field at the previous step's
    // position is only accessed, and the directions calculated,
    // when preparing to step to the next point.
    if(Sprev != NULL){
        Sprev->xdir = xdir;
        Sprev->ydir = ydir;
        Sprev->zdir = zdir;
        Sprev->Ro = fabs(R);
    }

    // In principle, all threads should be in sync here, but in
    // case they aren't, re-sync them now.
    //__syncthreads();
}
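/*
   Hedged numerical sanity check of the helical step above (illustrative only;
   it assumes qBr2p is the usual ~0.003 GeV/c per Tesla-cm conversion constant,
   which is defined outside this file):

    float q_over_p = 1.0;                         // 1 GeV/c, unit charge
    float Bmag     = 2.0;                         // Tesla
    float h        = 1.0;                         // cm
    float sin_theta_over_R = qBr2p*q_over_p*Bmag; // same expression as in GPU_TestStep
    float dphi = h*sin_theta_over_R;              // ~0.006 rad

   i.e. R = p_t/(qBr2p*q*Bmag) is roughly 167 cm for a fully transverse
   1 GeV/c track, so a 1 cm step turns through only a few mrad and the
   small-dphi comments above are justified.
*/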
//----------------
// GPU_SwimSingle
//----------------
__global__ void GPU_SwimSingle(float q, float mass, float3 start_pos, float3 start_mom, trajectory_t *traj)
{
    /// This is a host-callable kernel that will swim
    /// a single track.

    state_t S;
    float px = start_mom.x;
    float py = start_mom.y;
    float pz = start_mom.z;
    float p = sqrt(px*px + py*py + pz*pz);
    px /= p;
    py /= p;
    pz /= p;
    S.mom = make_float3(px, py, pz);
    S.pos = start_pos;
    S.q_over_p = q/p;
    S.mass = mass;

    GPU_Swim(S, traj);
}

//----------------
// MakeTrajectoryBlock
//----------------
TrajectoryBlock* MakeTrajectoryBlock(void)
{
    /// Create a TrajectoryBlock structure, allocating
    /// both host and device memory to hold the
    /// given number of trajectory_t structures. Memory
    /// is also allocated on the device to hold all of
    /// the trajectory steps. The relevant pointers are
    /// all set in the structures so that the "d" field
    /// of the trajectory block can be passed into the
    /// relevant GPU tracking routines used for processing
    /// multiple tracks.

    cudaError_t err;

    // Allocate memory for the trajectory block structure itself
    TrajectoryBlock *tb = (TrajectoryBlock*)malloc(sizeof(TrajectoryBlock));
    tb->N = n_state_deltas;
    tb->Nhits = 0;

    // Allocate enough memory on the device to hold all steps
    // for all trajectories in the block
    unsigned int max_steps = 10000;
    unsigned int size_steps = max_steps*sizeof(step_t); // bytes per trajectory
    step_t *steps;
    err = cudaMalloc(&steps, tb->N*size_steps);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));

    // Allocate memory on host to hold trajectory_t structures
    // and fill in fields
    tb->h = (trajectory_t*)malloc(tb->N*sizeof(trajectory_t));
    trajectory_t *traj = tb->h;
    for(unsigned int i=0; i<tb->N; i++, traj++){
        traj->Nsteps = 0;
        traj->max_steps = max_steps;
        traj->steps = &steps[i*max_steps];
    }

    // Allocate memory on device to hold trajectory_t structures
    // and copy values from host over.
    err = cudaMalloc(&tb->d, tb->N*sizeof(trajectory_t));
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMemcpy(tb->d, tb->h, tb->N*sizeof(trajectory_t), cudaMemcpyHostToDevice);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));

    // Allocate memory on host for various matrices/arrays
    tb->Binv_matrix_h = (float*)malloc( sizeof(float) * 5*5 );
    tb->B_matrix_h    = (float*)malloc( sizeof(float) * 5*5 );
    tb->dS_matrix_h   = (float*)malloc( sizeof(float) * 5 );

    // Allocate memory on device to hold index of closest step, distance
    // from trajectory to wire, and residual of hit
    int N = MAX_WIRES_HIT; // maximum number of wires hit
    int M = tb->N;         // number of trajectories
    int NxM = N * M;
    tb->max_hits = N;
    err = cudaMalloc((void **) &tb->wires_hit_d, sizeof(int) * N);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMalloc((void **) &tb->hit_index_d, sizeof(int) * NxM);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMalloc((void **) &tb->hit_dist_d, sizeof(float) * NxM);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMalloc((void **) &tb->hit_resi_d, sizeof(float) * NxM);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMalloc((void **) &tb->chisq_per_dof_d, sizeof(float) * M);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMalloc((void **) &tb->Binv_matrix_d, sizeof(float) * 5*5);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMalloc((void **) &tb->B_matrix_d, sizeof(float) * 5*5);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMalloc((void **) &tb->dSdR_matrix_d, sizeof(float) * N*5);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaMalloc((void **) &tb->dS_matrix_d, sizeof(float) * 5);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));

    // Create a dedicated cuda stream to use with this TrajectoryBlock.
    // This will allow multiple TrajectoryBlocks to be filled
    // simultaneously.
    err = cudaStreamCreate(&tb->stream);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));

    // return pointer to TrajectoryBlock
    return tb;
}
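/*
   Illustrative (hedged) sketch of the TrajectoryBlock lifecycle implemented
   in this file: GPUInitTracking() must run first so n_state_deltas is set,
   then a block is created here and released with FreeTrajectoryBlock() below.
   The mass hypothesis values are example numbers only (GeV/c^2).

    vector<float> mass_hypotheses;
    mass_hypotheses.push_back(0.000511); // electron
    mass_hypotheses.push_back(0.13957);  // charged pion
    GPUInitTracking(mass_hypotheses);

    TrajectoryBlock *tb = MakeTrajectoryBlock();
    // ... launch the multi-track swim/fit kernels, passing tb->d and tb->stream ...
    FreeTrajectoryBlock(tb);
*/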
//----------------
// FreeTrajectoryBlock
//----------------
void FreeTrajectoryBlock(TrajectoryBlock *tb)
{
    cudaError_t err;

    if(!tb) throw SimpleException(string("ERROR: trying to free NULL TrajectoryBlock pointer"));

    cudaStreamDestroy(tb->stream);

    err = cudaFree(tb->h->steps);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));

    free(tb->h);
    free(tb->Binv_matrix_h);
    free(tb->B_matrix_h);
    free(tb->dS_matrix_h);

    err = cudaFree(tb->d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->wires_hit_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->hit_index_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->hit_dist_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->hit_resi_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->chisq_per_dof_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->Binv_matrix_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->B_matrix_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->dSdR_matrix_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));
    err = cudaFree(tb->dS_matrix_d);
    if(err != cudaSuccess) throw SimpleException(string("CUDA ERROR: ") + cudaGetErrorString(err));

    free(tb);
}
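/*
   Illustrative (hedged) usage of the host-side Swim() wrapper defined just
   below. Passing a NULL trajectory pointer asks Swim() to allocate one; the
   start position/momentum values here are arbitrary examples (cm, GeV/c).

    trajectory_t *traj = NULL;
    float3 pos = make_float3(0.0, 0.0, 65.0);
    float3 mom = make_float3(0.1, 0.0, 1.0);
    Swim(+1.0, 0.13957, pos, mom, traj); // charged pion hypothesis
    for(int i=0; i<traj->Nsteps; i++){
        cout << "step " << i << ": z=" << traj->steps[i].pos.z << "  s=" << traj->steps[i].s << endl;
    }
*/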
//----------------
// Swim
//----------------
void Swim(float q, float mass, float3 start_pos, float3 start_mom, trajectory_t *&traj)
{
    /// This is primarily intended for testing. It is a host-resident
    /// routine that will call the device swimmer, copying the entire
    /// trajectory from the device back to the host. If the given
    /// traj pointer reference has a value of NULL, then a trajectory
    /// structure will be allocated along with the memory to hold the
    /// steps. The caller may either destroy this, or recycle it on
    /// a subsequent call.

    // Allocate memory on host for trajectory
    if(traj == NULL){
        traj = (trajectory_t*)malloc(sizeof(trajectory_t));
        traj->Nsteps = 0;
        traj->max_steps = 0;
        traj->steps = NULL;
    }
    if(traj->steps == NULL){
        traj->max_steps = 10;
        traj->steps = (step_t*)malloc(sizeof(step_t)*traj->max_steps);
    }

    // Allocate memory on device for trajectory
    trajectory_t *d_traj;
    step_t *d_steps;
    cudaMalloc(&d_traj, sizeof(trajectory_t));
    cudaMalloc(&d_steps, sizeof(step_t)*traj->max_steps);

    // Make a copy of the host-resident trajectory_t structure so we can
    // replace the pointer to "steps" with the device-resident steps.
    trajectory_t tmp = *traj;
    tmp.steps = d_steps;

    // Copy trajectory structure to device memory
    cudaMemcpy(d_traj, &tmp, sizeof(trajectory_t), cudaMemcpyHostToDevice);

    // Call device swimmer
    GPU_SwimSingle<<<1,1>>>(q, mass, start_pos, start_mom, d_traj);

    // Copy trajectory structure back from device memory
    cudaMemcpy(&tmp, d_traj, sizeof(trajectory_t), cudaMemcpyDeviceToHost);

    // Copy steps back from device memory
    traj->Nsteps = tmp.Nsteps;
    if(traj->Nsteps < 1 || traj->Nsteps > traj->max_steps){
        throw SimpleException("Bad number of steps returned from GPU_Swim");
    }
    cudaMemcpy(traj->steps, d_steps, sizeof(step_t)*traj->Nsteps, cudaMemcpyDeviceToHost);

    // Release the device-side copies allocated above
    cudaFree(d_steps);
    cudaFree(d_traj);
}

#endif // __DMAGNETICFIELDSTEPPER_CU__