/* !-------------------------------------------------------------------------! ! ! ! N A S P A R A L L E L B E N C H M A R K S 3.0 ! ! ! ! J A V A V E R S I O N ! ! ! ! Y S o l v e r ! ! ! !-------------------------------------------------------------------------! ! ! ! YSolver implements thread for y_solve subroutine ! ! of the BT benchmark. ! ! ! ! Permission to use, copy, distribute and modify this software ! ! for any purpose with or without fee is hereby granted. We ! ! request, however, that all derived work reference the NAS ! ! Parallel Benchmarks 3.0. This software is provided "as is" ! ! without express or implied warranty. ! ! ! ! Information on NPB 3.0, including the Technical Report NAS-02-008 ! ! "Implementation of the NAS Parallel Benchmarks in Java", ! ! original specifications, source code, results and information ! ! on how to submit new results, is available at: ! ! ! ! http://www.nas.nasa.gov/Software/NPB/ ! ! ! ! Send comments or suggestions to npb@nas.nasa.gov ! ! ! ! NAS Parallel Benchmarks Group ! ! NASA Ames Research Center ! ! Mail Stop: T27A-1 ! ! Moffett Field, CA 94035-1000 ! ! ! ! E-mail: npb@nas.nasa.gov ! ! Fax: (650) 604-3957 ! ! ! !-------------------------------------------------------------------------! ! Translation to Java and to MultiThreaded Code: ! ! Michael A. Frumkin ! ! Mathew Schultz ! !-------------------------------------------------------------------------! */ package NPB_JAV.BTThreads; import NPB_JAV.BT; public class YSolver extends BTBase{ public int id; public boolean done = true; //private arrays and data double fjac[] = null; double njac[] = null; double lhs[] = null; double tmp1; double tmp2; double tmp3; int lower_bound; int upper_bound; public YSolver(BT bt,int low, int high){ Init(bt); lower_bound=low; upper_bound=high; fjac = new double[5*5*(problem_size+1)]; njac = new double[5*5*(problem_size+1)]; lhs = new double[5*5*3*(problem_size+1)]; setPriority(Thread.MAX_PRIORITY); setDaemon(true); master=bt; } void Init(BT bt){ //initialize shared data IMAX=bt.IMAX; JMAX=bt.JMAX; KMAX=bt.KMAX; problem_size=bt.problem_size; grid_points=bt.grid_points; niter_default=bt.niter_default; dt_default=bt.dt_default; u=bt.u; rhs=bt.rhs; forcing=bt.forcing; cv=bt.cv; q=bt.q; cuf=bt.cuf; isize2=bt.isize2; jsize2=bt.jsize2; ksize2=bt.ksize2; us=bt.us; vs=bt.vs; ws=bt.ws; qs=bt.qs; rho_i=bt.rho_i; square=bt.square; jsize1=bt.jsize1; ksize1=bt.ksize1; ue=bt.ue; buf=bt.buf; jsize3=bt.jsize3; } public void run(){ for(;;){ synchronized(this){ while(done==true){ try{ wait(); synchronized(master){ master.notify();} }catch(InterruptedException ie){} } step(); synchronized(master){done=true;master.notify();} } } } public void step(){ int i, j, k, m, n, jsize; //--------------------------------------------------------------------- // This function computes the left hand side for the three y-factors //--------------------------------------------------------------------- jsize = grid_points[1]-1; //--------------------------------------------------------------------- // Compute the indices for storing the tri-diagonal matrix; // determine a (labeled f) and n jacobians for cell c //--------------------------------------------------------------------- for(k=lower_bound;k<=upper_bound;k++){ for(i=1;i<=grid_points[0]-2;i++){ for(j=0;j<=jsize;j++){ tmp1 = rho_i[i+j*jsize1+k*ksize1]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; fjac[0+0*isize4+j*jsize4] = 0.0; fjac[0+1*isize4+j*jsize4] = 0.0; fjac[0+2*isize4+j*jsize4] = 1.0; fjac[0+3*isize4+j*jsize4] = 0.0; fjac[0+4*isize4+j*jsize4] = 0.0; fjac[1+0*isize4+j*jsize4] = - ( u[1+i*isize2+j*jsize2+k*ksize2]*u[2+i*isize2+j*jsize2+k*ksize2] ) * tmp2; fjac[1+1*isize4+j*jsize4] = u[2+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[1+2*isize4+j*jsize4] = u[1+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[1+3*isize4+j*jsize4] = 0.0; fjac[1+4*isize4+j*jsize4] = 0.0; fjac[2+0*isize4+j*jsize4] = - ( u[2+i*isize2+j*jsize2+k*ksize2]*u[2+i*isize2+j*jsize2+k*ksize2]*tmp2) + c2 * qs[i+j*jsize1+k*ksize1]; fjac[2+1*isize4+j*jsize4] = - c2 * u[1+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[2+2*isize4+j*jsize4] = ( 2.0 - c2 ) * u[2+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[2+3*isize4+j*jsize4] = - c2 * u[3+i*isize2+j*jsize2+k*ksize2] * tmp1 ; fjac[2+4*isize4+j*jsize4] = c2; fjac[3+0*isize4+j*jsize4] = - ( u[2+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] ) * tmp2; fjac[3+1*isize4+j*jsize4] = 0.0; fjac[3+2*isize4+j*jsize4] = u[3+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[3+3*isize4+j*jsize4] = u[2+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[3+4*isize4+j*jsize4] = 0.0; fjac[4+0*isize4+j*jsize4] = ( c2 * 2.0 * square[i+j*jsize1+k*ksize1] - c1 * u[4+i*isize2+j*jsize2+k*ksize2] ) * u[2+i*isize2+j*jsize2+k*ksize2] * tmp2; fjac[4+1*isize4+j*jsize4] = - c2 * u[1+i*isize2+j*jsize2+k*ksize2]*u[2+i*isize2+j*jsize2+k*ksize2] * tmp2; fjac[4+2*isize4+j*jsize4] = c1 * u[4+i*isize2+j*jsize2+k*ksize2] * tmp1 - c2 * ( qs[i+j*jsize1+k*ksize1] + u[2+i*isize2+j*jsize2+k*ksize2]*u[2+i*isize2+j*jsize2+k*ksize2] * tmp2 ); fjac[4+3*isize4+j*jsize4] = - c2 * ( u[2+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] ) * tmp2; fjac[4+4*isize4+j*jsize4] = c1 * u[2+i*isize2+j*jsize2+k*ksize2] * tmp1 ; njac[0+0*isize4+j*jsize4] = 0.0; njac[0+1*isize4+j*jsize4] = 0.0; njac[0+2*isize4+j*jsize4] = 0.0; njac[0+3*isize4+j*jsize4] = 0.0; njac[0+4*isize4+j*jsize4] = 0.0; njac[1+0*isize4+j*jsize4] = - c3c4 * tmp2 * u[1+i*isize2+j*jsize2+k*ksize2]; njac[1+1*isize4+j*jsize4] = c3c4 * tmp1; njac[1+2*isize4+j*jsize4] = 0.0; njac[1+3*isize4+j*jsize4] = 0.0; njac[1+4*isize4+j*jsize4] = 0.0; njac[2+0*isize4+j*jsize4] = - con43 * c3c4 * tmp2 * u[2+i*isize2+j*jsize2+k*ksize2]; njac[2+1*isize4+j*jsize4] = 0.0; njac[2+2*isize4+j*jsize4] = con43 * c3c4 * tmp1; njac[2+3*isize4+j*jsize4] = 0.0; njac[2+4*isize4+j*jsize4] = 0.0; njac[3+0*isize4+j*jsize4] = - c3c4 * tmp2 * u[3+i*isize2+j*jsize2+k*ksize2]; njac[3+1*isize4+j*jsize4] = 0.0; njac[3+2*isize4+j*jsize4] = 0.0; njac[3+3*isize4+j*jsize4] = c3c4 * tmp1; njac[3+4*isize4+j*jsize4] = 0.0; njac[4+0*isize4+j*jsize4] = - ( c3c4 - c1345 ) * tmp3 * (Math.pow(u[1+i*isize2+j*jsize2+k*ksize2],2)) - ( con43 * c3c4 - c1345 ) * tmp3 * (Math.pow(u[2+i*isize2+j*jsize2+k*ksize2],2)) - ( c3c4 - c1345 ) * tmp3 * (Math.pow(u[3+i*isize2+j*jsize2+k*ksize2],2)) - c1345 * tmp2 * u[4+i*isize2+j*jsize2+k*ksize2]; njac[4+1*isize4+j*jsize4] = ( c3c4 - c1345 ) * tmp2 * u[1+i*isize2+j*jsize2+k*ksize2]; njac[4+2*isize4+j*jsize4] = ( con43 * c3c4 - c1345 ) * tmp2 * u[2+i*isize2+j*jsize2+k*ksize2]; njac[4+3*isize4+j*jsize4] = ( c3c4 - c1345 ) * tmp2 * u[3+i*isize2+j*jsize2+k*ksize2]; njac[4+4*isize4+j*jsize4] = ( c1345 ) * tmp1; } //--------------------------------------------------------------------- // now joacobians set, so form left hand side in y direction //--------------------------------------------------------------------- lhsinit(lhs, jsize); for(j=1;j<=jsize-1;j++){ tmp1 = dt * ty1; tmp2 = dt * ty2; lhs[0+0*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[0+0*isize4+(j-1)*jsize4] - tmp1 * njac[0+0*isize4+(j-1)*jsize4] - tmp1 * dy1 ; lhs[0+1*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[0+1*isize4+(j-1)*jsize4] - tmp1 * njac[0+1*isize4+(j-1)*jsize4]; lhs[0+2*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[0+2*isize4+(j-1)*jsize4] - tmp1 * njac[0+2*isize4+(j-1)*jsize4]; lhs[0+3*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[0+3*isize4+(j-1)*jsize4] - tmp1 * njac[0+3*isize4+(j-1)*jsize4]; lhs[0+4*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[0+4*isize4+(j-1)*jsize4] - tmp1 * njac[0+4*isize4+(j-1)*jsize4]; lhs[1+0*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[1+0*isize4+(j-1)*jsize4] - tmp1 * njac[1+0*isize4+(j-1)*jsize4]; lhs[1+1*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[1+1*isize4+(j-1)*jsize4] - tmp1 * njac[1+1*isize4+(j-1)*jsize4] - tmp1 * dy2; lhs[1+2*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[1+2*isize4+(j-1)*jsize4] - tmp1 * njac[1+2*isize4+(j-1)*jsize4]; lhs[1+3*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[1+3*isize4+(j-1)*jsize4] - tmp1 * njac[1+3*isize4+(j-1)*jsize4]; lhs[1+4*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[1+4*isize4+(j-1)*jsize4] - tmp1 * njac[1+4*isize4+(j-1)*jsize4]; lhs[2+0*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[2+0*isize4+(j-1)*jsize4] - tmp1 * njac[2+0*isize4+(j-1)*jsize4]; lhs[2+1*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[2+1*isize4+(j-1)*jsize4] - tmp1 * njac[2+1*isize4+(j-1)*jsize4]; lhs[2+2*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[2+2*isize4+(j-1)*jsize4] - tmp1 * njac[2+2*isize4+(j-1)*jsize4] - tmp1 * dy3 ; lhs[2+3*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[2+3*isize4+(j-1)*jsize4] - tmp1 * njac[2+3*isize4+(j-1)*jsize4]; lhs[2+4*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[2+4*isize4+(j-1)*jsize4] - tmp1 * njac[2+4*isize4+(j-1)*jsize4]; lhs[3+0*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[3+0*isize4+(j-1)*jsize4] - tmp1 * njac[3+0*isize4+(j-1)*jsize4]; lhs[3+1*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[3+1*isize4+(j-1)*jsize4] - tmp1 * njac[3+1*isize4+(j-1)*jsize4]; lhs[3+2*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[3+2*isize4+(j-1)*jsize4] - tmp1 * njac[3+2*isize4+(j-1)*jsize4]; lhs[3+3*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[3+3*isize4+(j-1)*jsize4] - tmp1 * njac[3+3*isize4+(j-1)*jsize4] - tmp1 * dy4; lhs[3+4*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[3+4*isize4+(j-1)*jsize4] - tmp1 * njac[3+4*isize4+(j-1)*jsize4]; lhs[4+0*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[4+0*isize4+(j-1)*jsize4] - tmp1 * njac[4+0*isize4+(j-1)*jsize4]; lhs[4+1*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[4+1*isize4+(j-1)*jsize4] - tmp1 * njac[4+1*isize4+(j-1)*jsize4]; lhs[4+2*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[4+2*isize4+(j-1)*jsize4] - tmp1 * njac[4+2*isize4+(j-1)*jsize4]; lhs[4+3*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[4+3*isize4+(j-1)*jsize4] - tmp1 * njac[4+3*isize4+(j-1)*jsize4]; lhs[4+4*isize4+aa*jsize4+j*ksize4] = - tmp2 * fjac[4+4*isize4+(j-1)*jsize4] - tmp1 * njac[4+4*isize4+(j-1)*jsize4] - tmp1 * dy5; lhs[0+0*isize4+bb*jsize4+j*ksize4] = 1.0 + tmp1 * 2.0 * njac[0+0*isize4+j*jsize4] + tmp1 * 2.0 * dy1; lhs[0+1*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[0+1*isize4+j*jsize4]; lhs[0+2*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[0+2*isize4+j*jsize4]; lhs[0+3*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[0+3*isize4+j*jsize4]; lhs[0+4*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[0+4*isize4+j*jsize4]; lhs[1+0*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[1+0*isize4+j*jsize4]; lhs[1+1*isize4+bb*jsize4+j*ksize4] = 1.0 + tmp1 * 2.0 * njac[1+1*isize4+j*jsize4] + tmp1 * 2.0 * dy2; lhs[1+2*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[1+2*isize4+j*jsize4]; lhs[1+3*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[1+3*isize4+j*jsize4]; lhs[1+4*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[1+4*isize4+j*jsize4]; lhs[2+0*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[2+0*isize4+j*jsize4]; lhs[2+1*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[2+1*isize4+j*jsize4]; lhs[2+2*isize4+bb*jsize4+j*ksize4] = 1.0 + tmp1 * 2.0 * njac[2+2*isize4+j*jsize4] + tmp1 * 2.0 * dy3; lhs[2+3*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[2+3*isize4+j*jsize4]; lhs[2+4*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[2+4*isize4+j*jsize4]; lhs[3+0*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[3+0*isize4+j*jsize4]; lhs[3+1*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[3+1*isize4+j*jsize4]; lhs[3+2*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[3+2*isize4+j*jsize4]; lhs[3+3*isize4+bb*jsize4+j*ksize4] = 1.0 + tmp1 * 2.0 * njac[3+3*isize4+j*jsize4] + tmp1 * 2.0 * dy4; lhs[3+4*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[3+4*isize4+j*jsize4]; lhs[4+0*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[4+0*isize4+j*jsize4]; lhs[4+1*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[4+1*isize4+j*jsize4]; lhs[4+2*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[4+2*isize4+j*jsize4]; lhs[4+3*isize4+bb*jsize4+j*ksize4] = tmp1 * 2.0 * njac[4+3*isize4+j*jsize4]; lhs[4+4*isize4+bb*jsize4+j*ksize4] = 1.0 + tmp1 * 2.0 * njac[4+4*isize4+j*jsize4] + tmp1 * 2.0 * dy5; lhs[0+0*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[0+0*isize4+(j+1)*jsize4] - tmp1 * njac[0+0*isize4+(j+1)*jsize4] - tmp1 * dy1; lhs[0+1*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[0+1*isize4+(j+1)*jsize4] - tmp1 * njac[0+1*isize4+(j+1)*jsize4]; lhs[0+2*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[0+2*isize4+(j+1)*jsize4] - tmp1 * njac[0+2*isize4+(j+1)*jsize4]; lhs[0+3*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[0+3*isize4+(j+1)*jsize4] - tmp1 * njac[0+3*isize4+(j+1)*jsize4]; lhs[0+4*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[0+4*isize4+(j+1)*jsize4] - tmp1 * njac[0+4*isize4+(j+1)*jsize4]; lhs[1+0*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[1+0*isize4+(j+1)*jsize4] - tmp1 * njac[1+0*isize4+(j+1)*jsize4]; lhs[1+1*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[1+1*isize4+(j+1)*jsize4] - tmp1 * njac[1+1*isize4+(j+1)*jsize4] - tmp1 * dy2; lhs[1+2*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[1+2*isize4+(j+1)*jsize4] - tmp1 * njac[1+2*isize4+(j+1)*jsize4]; lhs[1+3*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[1+3*isize4+(j+1)*jsize4] - tmp1 * njac[1+3*isize4+(j+1)*jsize4]; lhs[1+4*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[1+4*isize4+(j+1)*jsize4] - tmp1 * njac[1+4*isize4+(j+1)*jsize4]; lhs[2+0*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[2+0*isize4+(j+1)*jsize4] - tmp1 * njac[2+0*isize4+(j+1)*jsize4]; lhs[2+1*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[2+1*isize4+(j+1)*jsize4] - tmp1 * njac[2+1*isize4+(j+1)*jsize4]; lhs[2+2*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[2+2*isize4+(j+1)*jsize4] - tmp1 * njac[2+2*isize4+(j+1)*jsize4] - tmp1 * dy3; lhs[2+3*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[2+3*isize4+(j+1)*jsize4] - tmp1 * njac[2+3*isize4+(j+1)*jsize4]; lhs[2+4*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[2+4*isize4+(j+1)*jsize4] - tmp1 * njac[2+4*isize4+(j+1)*jsize4]; lhs[3+0*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[3+0*isize4+(j+1)*jsize4] - tmp1 * njac[3+0*isize4+(j+1)*jsize4]; lhs[3+1*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[3+1*isize4+(j+1)*jsize4] - tmp1 * njac[3+1*isize4+(j+1)*jsize4]; lhs[3+2*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[3+2*isize4+(j+1)*jsize4] - tmp1 * njac[3+2*isize4+(j+1)*jsize4]; lhs[3+3*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[3+3*isize4+(j+1)*jsize4] - tmp1 * njac[3+3*isize4+(j+1)*jsize4] - tmp1 * dy4; lhs[3+4*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[3+4*isize4+(j+1)*jsize4] - tmp1 * njac[3+4*isize4+(j+1)*jsize4]; lhs[4+0*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[4+0*isize4+(j+1)*jsize4] - tmp1 * njac[4+0*isize4+(j+1)*jsize4]; lhs[4+1*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[4+1*isize4+(j+1)*jsize4] - tmp1 * njac[4+1*isize4+(j+1)*jsize4]; lhs[4+2*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[4+2*isize4+(j+1)*jsize4] - tmp1 * njac[4+2*isize4+(j+1)*jsize4]; lhs[4+3*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[4+3*isize4+(j+1)*jsize4] - tmp1 * njac[4+3*isize4+(j+1)*jsize4]; lhs[4+4*isize4+cc*jsize4+j*ksize4] = tmp2 * fjac[4+4*isize4+(j+1)*jsize4] - tmp1 * njac[4+4*isize4+(j+1)*jsize4] - tmp1 * dy5; } //--------------------------------------------------------------------- // performs guaussian elimination on this cell. // // assumes that unpacking routines for non-first cells // preload C' and rhs' from previous cell. // // assumed send happens outside this routine, but that // c'(JMAX) and rhs'(JMAX) will be sent to next cell //--------------------------------------------------------------------- //--------------------------------------------------------------------- // multiply c(i,0,k) by b_inverse and copy back to c // multiply rhs(0) by b_inverse(0) and copy to rhs //--------------------------------------------------------------------- binvcrhs( lhs,0+0*isize4+bb*jsize4+0*ksize4, lhs,0+0*isize4+cc*jsize4+0*ksize4, rhs,0+i*isize2+0*jsize2+k*ksize2 ); //--------------------------------------------------------------------- // begin inner most do loop // do all the elements of the cell unless last //--------------------------------------------------------------------- for(j=1;j<=jsize-1;j++){ //--------------------------------------------------------------------- // subtract A*lhs_vector(j-1) from lhs_vector(j) // // rhs(j) = rhs(j) - A*rhs(j-1) //--------------------------------------------------------------------- matvec_sub(lhs,0+0*isize4+aa*jsize4+j*ksize4, rhs,0+i*isize2+(j-1)*jsize2+k*ksize2, rhs,0+i*isize2+j*jsize2+k*ksize2); //--------------------------------------------------------------------- // B(j) = B(j) - C(j-1)*A(j) //--------------------------------------------------------------------- matmul_sub(lhs,0+0*isize4+aa*jsize4+j*ksize4, lhs,0+0*isize4+cc*jsize4+(j-1)*ksize4, lhs,0+0*isize4+bb*jsize4+j*ksize4); //--------------------------------------------------------------------- // multiply c(i,j,k) by b_inverse and copy back to c // multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs //--------------------------------------------------------------------- binvcrhs( lhs,0+0*isize4+bb*jsize4+j*ksize4, lhs,0+0*isize4+cc*jsize4+j*ksize4, rhs,0+i*isize2+j*jsize2+k*ksize2 ); } //--------------------------------------------------------------------- // rhs(jsize) = rhs(jsize) - A*rhs(jsize-1) //--------------------------------------------------------------------- matvec_sub(lhs,0+0*isize4+aa*jsize4+jsize*ksize4, rhs,0+i*isize2+(jsize-1)*jsize2+k*ksize2, rhs,0+i*isize2+jsize*jsize2+k*ksize2); //--------------------------------------------------------------------- // B(jsize) = B(jsize) - C(jsize-1)*A(jsize) // matmul_sub(aa,i,jsize,k,c, // $ cc,i,jsize-1,k,c,bb,i,jsize,k) //--------------------------------------------------------------------- matmul_sub(lhs,0+0*isize4+aa*jsize4+jsize*ksize4, lhs,0+0*isize4+cc*jsize4+(jsize-1)*ksize4, lhs,0+0*isize4+bb*jsize4+jsize*ksize4); //--------------------------------------------------------------------- // multiply rhs(jsize) by b_inverse(jsize) and copy to rhs //--------------------------------------------------------------------- binvrhs( lhs,0+0*isize4+bb*jsize4+jsize*ksize4, rhs,0+i*isize2+jsize*jsize2+k*ksize2 ); //--------------------------------------------------------------------- // back solve: if last cell, then generate U(jsize)=rhs(jsize) // else assume U(jsize) is loaded in un pack backsub_info // so just use it // after u(jstart) will be sent to next cell //--------------------------------------------------------------------- for(j=jsize-1;j>=0;j--){ for(m=0;m<=BLOCK_SIZE-1;m++){ for(n=0;n<=BLOCK_SIZE-1;n++){ rhs[m+i*isize2+j*jsize2+k*ksize2] -= lhs[m+n*isize4+cc*jsize4+j*ksize4]*rhs[n+i*isize2+(j+1)*jsize2+k*ksize2]; } } } } } } };