#include #include #include #include #include #include using namespace std; #include #include "GPUMagneticFieldStepper.cu" #include "GPUResidualFinder.cu" #include "GPUDebug.cu" #include "GPUErrorCheck.h" TrajectoryBlock *tb = NULL; // for multi-threaded host application, this will need to be changed //---------------- // AdjustState //---------------- void AdjustState( state_t &S, float *dS ) { /// This will take the 5 parameter adjustments from /// the given dS array and use them to modify the state /// parameters in "S". This is called at the end of a /// fit iteration to adjust the state parameters. // Calculate cos(theta) and sin(theta) float3 &mom = S.mom; // already normalized float3 &pos = S.pos; float cos_theta = mom.z; // + or - float sin_theta = sqrt(1.0 - cos_theta*cos_theta); // + definite (0 <= theta <= pi) // Calculate cos(phi) and sin(phi). We need to handle the // special case when sin(theta)==0.0 so that we don't // divide by zero. Add a small epsilon to sin_theta // in the denominator if sin_theta is zero. In // these cases, both mom.x and mom.y should be zero so // both cos_phi and sin_phi will resolve to zero. We // therefore need to make cos_phi=1 in the case that // both cos_phi and sin_phi are zero. float epsilon = (double)(sin_theta==0.0)*1.0E-8; float cos_phi = mom.x/(sin_theta + epsilon); float sin_phi = mom.y/(sin_theta + epsilon); cos_phi += (double)(cos_phi==0.0 && sin_phi==0.0); // make cos_phi 1 if both cos_phi and sin_phi are zero // Natural coordinate system directions come from // rotating about y-axis by theta and then about the // z' axis by phi. float3 xdir = make_float3(cos_phi*cos_theta, sin_phi*cos_theta, (-sin_theta)); float3 ydir = make_float3( (-sin_theta) , cos_phi , 0.0 ); float3 zdir = make_float3(cos_phi*sin_theta, sin_phi*sin_theta, cos_theta ); float &Cpx = dS[0]; float &Cpy = dS[1]; float &Cx = dS[2]; float &Cy = dS[3]; float &Cq_over_p = dS[4]; mom -= Cpx*xdir + Cpy*ydir; pos -= Cx*xdir +Cy*ydir; S.q_over_p -= Cq_over_p; } //---------------- // GPUFitTrack //---------------- int GPUFitTrack(float3 &pos, float3 &mom, float &q_over_p, vector &wires_hit, vector &results) { // Make sure there is at least 1 hit if(wires_hit.size() < 1) return 5; // Allocate TrajectoryBlock if(tb == NULL) tb = MakeTrajectoryBlock(); // Copy wires hit into trajectory block's device memory cudaMemcpy(tb->wires_hit_d, &wires_hit[0], wires_hit.size()*sizeof(int), cudaMemcpyHostToDevice); // Setup some variables int N = wires_hit.size(); // number of wires hit int M = tb->N; // number of trajectories (=N_STATE_DELTAS) state_t S; S.pos = pos; S.mom = mom; S.q_over_p = q_over_p; S.mass = 0; tb->Nhits = N; // Iterate several times to converge fit int Niterations = 0; for(; Niterations<10; Niterations++){ // limit number of iterations to 10 // Check that state parameters do not contain any nan values if(!finite(length(S.mom))) return 1; if(!finite(length(S.pos))) return 2; if(!finite(S.q_over_p)) return 3; if(fabs(S.q_over_p) > 10.0) return 3; // Swim all trajectories (always give it a thread block size of 32 to match the warp size) int Nwarps = (M+31)/32; // find number of warps needed for M trajectories (assuming 32 threads per warp) int Nthreads = Nwarps*32; //char str[256]; //sprintf(str, "i=%d Nthreads=%d", Niterations, Nthreads); //printStateParams(S, str); //N blocks to find closest trajectory point to each wire for all trajectories GPU_SwimMultiple<<< N , Nthreads, 0, tb->stream>>>(S, tb->d); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaStreamSynchronize(tb->stream) ); #if debug printHostTrajectories(tb); #endif // Find the step on each trajectory closest to each wire hit (use one thread block for each wire hit) GPU_FindClosestTrajectoryPoint <<< N, Nthreads, 0, tb->stream >>>(N, M, tb->d, tb->wires_hit_d, tb->hit_index_d); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaStreamSynchronize(tb->stream) ); #if debug printNearestTrajectoryPoint(tb); #endif // Calculate the distance of each trajectory to each wire and // normalized residual (residual divided by uncertainty) // (use one thread block for each wire hit) GPU_CalcResiduals <<< N, Nthreads, 0, tb->stream >>>(N, M, tb->d, tb->wires_hit_d, tb->hit_index_d, tb->hit_dist_d, tb->hit_resi_d); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaStreamSynchronize(tb->stream) ); #if debug printResiduals(tb); #endif // Calculate Binv for first mass hypothesis (doing it additional hypotheses // will require some refactoring of upstream code) dim3 thrDim(5,5); GPU_CalcMatrix_Binv <<< n_mass_hypotheses, thrDim,0, tb->stream >>>(N, M, tb->d, tb->wires_hit_d, tb->hit_resi_d, tb->Binv_matrix_d); cudaMemcpyAsync(tb->Binv_matrix_h, tb->Binv_matrix_d, 25 * sizeof(float), cudaMemcpyDeviceToHost, tb->stream); cudaStreamSynchronize(tb->stream); // wait for copy to complete DMatrix5x5 Binv; for(int i=0; i<5; i++){ for(int j=0; j<5; j++){ Binv(i,j) = tb->Binv_matrix_h[i+j*5]; } } // Invert Binv in order to get "B" and copy into device memory DMatrix5x5 B(Binv.InvertSym()); bool OK = true; for(int i=0; i<5; i++){ for(int j=0; j<5; j++){ tb->B_matrix_h[i+j*5] = B(i,j); OK |= finite(B(i,j)); } } if(!OK) return 4; #if debug cout << "Printing Binv matrix: " << endl; printMatrix5x5(Binv); // print out Binv matrix cout << "Printing B matrix: " << endl; printMatrix5x5(B); // print out B matrix DMatrix5x5 tmp = B*Binv; cout << "Printing B*Binv matrix: " << endl; printMatrix5x5(tmp); // print out B*Binv matrix #endif cudaMemcpyAsync(tb->B_matrix_d, tb->B_matrix_h, 25 * sizeof(float), cudaMemcpyHostToDevice, tb->stream); cudaStreamSynchronize(tb->stream); // wait for copy to complete // Calculate dSdR matrix thrDim.x = N; thrDim.y = 5; GPU_Calc_dSdR <<< n_mass_hypotheses, thrDim,0, tb->stream >>>(N, M, tb->hit_resi_d, tb->B_matrix_d, tb->dSdR_matrix_d); #if debug printdSdRMatrix(tb); #endif // Calculate change in state parameters thrDim.x = 1; thrDim.y = 5; GPU_Calc_dS <<< n_mass_hypotheses, thrDim,0, tb->stream >>>(N, M, tb->hit_resi_d, tb->dSdR_matrix_d, tb->dS_matrix_d); cudaMemcpyAsync(tb->dS_matrix_h, tb->dS_matrix_d, 5 * sizeof(float), cudaMemcpyDeviceToHost, tb->stream); cudaStreamSynchronize(tb->stream); // wait for copy to complete #if debug printChangeInStateParams(tb); printStateParams(S, "before"); #endif // Modify state parameters AdjustState(S, tb->dS_matrix_h ); #if debug printStateParams(S, "after"); #endif } if(!finite(length(S.mom))) return 1; if(!finite(length(S.pos))) return 2; if(!finite(S.q_over_p)) return 3; if(S.q_over_p >= 10) return 3; // Copy results back into references so caller can access them pos = S.pos; mom = S.mom; q_over_p = S.q_over_p; return 0; }