/* !-------------------------------------------------------------------------! ! ! ! N A S P A R A L L E L B E N C H M A R K S 3.0 ! ! ! ! J A V A V E R S I O N ! ! ! ! B T ! ! ! !-------------------------------------------------------------------------! ! ! ! This benchmark is a serial/multithreaded version of the BT code. ! ! ! ! Permission to use, copy, distribute and modify this software ! ! for any purpose with or without fee is hereby granted. We ! ! request, however, that all derived work reference the NAS ! ! Parallel Benchmarks 3.0. This software is provided "as is" ! ! without express or implied warranty. ! ! ! ! Information on NPB 3.0, including the Technical Report NAS-02-008 ! ! "Implementation of the NAS Parallel Benchmarks in Java", ! ! original specifications, source code, results and information ! ! on how to submit new results, is available at: ! ! ! ! http://www.nas.nasa.gov/Software/NPB/ ! ! ! ! Send comments or suggestions to npb@nas.nasa.gov ! ! ! ! NAS Parallel Benchmarks Group ! ! NASA Ames Research Center ! ! Mail Stop: T27A-1 ! ! Moffett Field, CA 94035-1000 ! ! ! ! E-mail: npb@nas.nasa.gov ! ! Fax: (650) 604-3957 ! ! ! !-------------------------------------------------------------------------! ! ! ! Authors: R. Van der Wijngaart ! ! T. Harris ! ! M. Yarrow ! ! Modified for PBN (Programming Baseline for NPB): ! ! H. Jin ! ! Translation to Java and to MultiThreaded Code ! ! M. Frumkin ! ! M. Schultz ! !-------------------------------------------------------------------------! */ package NPB_JAV; import NPB_JAV.BTThreads.*; import NPB_JAV.BMInOut.*; import java.io.*; import java.text.*; public class BT extends BTBase{ public int bid=-1; public BMResults results; public boolean serial=true; double fjac[]; double njac[]; double lhs[]; double tmp1; double tmp2; double tmp3; public BT(char clss, int threads,boolean ser){ super(clss, threads); serial=ser; fjac = new double[5*5*(problem_size+1)]; njac = new double[5*5*(problem_size+1)]; lhs = new double[5*5*3*(problem_size+1)]; } public static void main(String argv[]){ BT bt = null; BMArgs.ParseCmdLineArgs(argv,BMName); char CLSS=BMArgs.CLASS; int np=BMArgs.num_threads; boolean serial=BMArgs.serial; try{ bt = new BT(CLSS,np,serial); }catch(OutOfMemoryError e){ BMArgs.outOfMemoryMessage(); System.exit(0); } bt.runBenchMark(); } public void run(){runBenchMark();} public void runBenchMark(){ BMArgs.Banner(BMName,CLASS,serial,num_threads); int numTimers=t_last+1; String t_names[] = new String[numTimers]; double trecs[] = new double[numTimers]; setTimers(t_names); int niter=getInputPars(); set_constants(); initialize(); exact_rhs(); if(!serial) setupThreads(this); //--------------------------------------------------------------------- // do one time step to touch all code, and reinitialize //--------------------------------------------------------------------- if(serial) adi_serial(); else adi(); initialize(); timer.resetAllTimers(); timer.start(t_total); for(int step=1;step<=niter;step++){ //niter if ( step % 20 == 0 || step == 1||step==niter) { System.out.println("Time step "+step); } if(serial) adi_serial(); else adi(); } timer.stop(t_total); int verified = verify(niter); double time = timer.readTimer(t_total); results=new BMResults(BMName, CLASS, grid_points[0], grid_points[1], grid_points[2], niter, time, getMFLOPS(time,niter), "floating point", verified, serial, num_threads, bid); results.print(); if(timeron) printTimers(t_names,trecs,time); } public double getMFLOPS(double total_time,int niter){ double mflops = 0.0; if( total_time > 0 ){ double n3 = grid_points[0]*grid_points[1]*grid_points[2]; double navg = (grid_points[0]+grid_points[1]+grid_points[2])/3.0; mflops = 3478.8*n3-17655.7*Math.pow(navg,2)+28023.7*navg; mflops *= niter / (total_time*1000000.0); } return mflops; } public void adi_serial(){ compute_rhs(); x_solve(); y_solve(); z_solve(); add(); } public void adi(){ if(timeron)timer.start(t_rhs); doRHS(); if(timeron)timer.start(t_rhsx); doRHS(); if(timeron)timer.stop(t_rhsx); if(timeron)timer.start(t_rhsy); doRHS(); if(timeron)timer.stop(t_rhsy); if(timeron)timer.start(t_rhsz); doRHS(); if(timeron)timer.stop(t_rhsz); doRHS(); if(timeron)timer.stop(t_rhs); if(timeron)timer.start(t_xsolve); synchronized(this){ for(int m=0;m total "); System.out.print("sub-rhs "); System.out.print(fmt.format(t)); System.out.print( " (" ); System.out.print(fmt.format(t*100/tmax)); System.out.println("%)"); t = trecs[t_rhs] - trecs[t_rhsx] + trecs[t_rhsy] + trecs[t_rhsz]; System.out.print(" --> total "); System.out.print("rest-rhs "); System.out.print(fmt.format(t)); System.out.print( " (" ); System.out.print(fmt.format(t*100/tmax)); System.out.println("%)"); }else if(i==t_zsolve) { t = trecs[t_zsolve] - trecs[t_rdis1] - trecs[t_rdis2]; System.out.print(" --> total "); System.out.print("sub-zsol "); System.out.print(fmt.format(t)); System.out.print( " " ); System.out.println(fmt.format(t*100/tmax)); System.out.println(); }else if(i==t_rdis2) { t = trecs[t_rdis1] + trecs[t_rdis2]; System.out.print(" --> total "); System.out.print("redist "); System.out.print(fmt.format(t)); System.out.print( " " ); System.out.println(fmt.format(t*100/tmax)); } } } public int getInputPars(){ int niter=0; File f2 = new File("inputbt.data"); if ( f2.exists() ){ try{ FileInputStream fis = new FileInputStream(f2); DataInputStream datafile = new DataInputStream(fis); System.out.println("Reading from input file inputbt.data"); niter = datafile.readInt(); dt = datafile.readDouble(); grid_points[0] = datafile.readInt(); grid_points[1] = datafile.readInt(); grid_points[2] = datafile.readInt(); fis.close(); }catch(Exception e){ System.err.println("exception caught!"); } }else{ System.out.println("No input file inputbt.data, Using compiled defaults"); niter = niter_default; dt = dt_default; grid_points[0] = problem_size; grid_points[1] = problem_size; grid_points[2] = problem_size; } System.out.println( "Size: "+grid_points[0] +" X "+grid_points[1] +" X "+grid_points[2]); if ( (grid_points[0] > IMAX) || (grid_points[1] > JMAX) || (grid_points[2] > KMAX) ) { System.out.println("Problem size too big for array"); System.exit(0); } System.out.println("Iterations: "+niter+" dt: "+dt); return niter; } public void setTimers(String t_names[]){ File f1 = new File("timer.flag"); timeron = false; if( f1.exists() ){ timeron = true; t_names[t_total] = new String("total "); t_names[t_rhsx] = new String("rhsx "); t_names[t_rhsy] = new String("rhsy "); t_names[t_rhsz] = new String("rhsz "); t_names[t_rhs] = new String("rhs "); t_names[t_xsolve] = new String("xsolve "); t_names[t_ysolve] = new String("ysolve "); t_names[t_zsolve] = new String("zsolve "); t_names[t_rdis1] = new String("redist1 "); t_names[t_rdis2] = new String("redist2 "); t_names[t_add] = new String("add "); } } public void rhs_norm( double rms[], int rmsoffst ){ int i, j, k, d, m; double add; for(m=0;m=0;i--){ for(m=0;m<=BLOCK_SIZE-1;m++){ for(n=0;n<=BLOCK_SIZE-1;n++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - lhs[m+n*isize4+cc*jsize4+i*ksize4]*rhs[n+(i+1)*isize2+j*jsize2+k*ksize2]; } } } } } if(timeron)timer.stop(t_xsolve); } public void compute_rhs(){ int i, j, k, m; double rho_inv, uijk, up1, um1, vijk, vp1, vm1, wijk, wp1, wm1; if (timeron)timer.start(t_rhs); //--------------------------------------------------------------------- // compute the reciprocal of density, and the kinetic energy, // and the speed of sound. //--------------------------------------------------------------------- for(k=0;k<=grid_points[2]-1;k++){ for(j=0;j<=grid_points[1]-1;j++){ for(i=0;i<=grid_points[0]-1;i++){ rho_inv = 1.0/u[0+i*isize2+j*jsize2+k*ksize2]; rho_i[i+j*jsize1+k*ksize1] = rho_inv; us[i+j*jsize1+k*ksize1] = u[1+i*isize2+j*jsize2+k*ksize2] * rho_inv; vs[i+j*jsize1+k*ksize1] = u[2+i*isize2+j*jsize2+k*ksize2] * rho_inv; ws[i+j*jsize1+k*ksize1] = u[3+i*isize2+j*jsize2+k*ksize2] * rho_inv; square[i+j*jsize1+k*ksize1] = 0.5* ( u[1+i*isize2+j*jsize2+k*ksize2]*u[1+i*isize2+j*jsize2+k*ksize2] + u[2+i*isize2+j*jsize2+k*ksize2]*u[2+i*isize2+j*jsize2+k*ksize2] + u[3+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] ) * rho_inv; qs[i+j*jsize1+k*ksize1] = square[i+j*jsize1+k*ksize1] * rho_inv; } } } //--------------------------------------------------------------------- // copy the exact forcing term to the right hand side; because // this forcing term is known, we can store it on the whole grid // including the boundary //--------------------------------------------------------------------- for(k=0;k<=grid_points[2]-1;k++){ for(j=0;j<=grid_points[1]-1;j++){ for(i=0;i<=grid_points[0]-1;i++){ for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = forcing[m+i*isize2+j*jsize2+k*ksize2]; } } } } if (timeron) timer.start(t_rhsx); //--------------------------------------------------------------------- // compute xi-direction fluxes //--------------------------------------------------------------------- for(k=1;k<=grid_points[2]-2;k++){ for(j=1;j<=grid_points[1]-2;j++){ for(i=1;i<=grid_points[0]-2;i++){ uijk = us[i+j*jsize1+k*ksize1]; up1 = us[(i+1)+j*jsize1+k*ksize1]; um1 = us[(i-1)+j*jsize1+k*ksize1]; rhs[0+i*isize2+j*jsize2+k*ksize2] = rhs[0+i*isize2+j*jsize2+k*ksize2] + dx1tx1 * (u[0+(i+1)*isize2+j*jsize2+k*ksize2] - 2.0*u[0+i*isize2+j*jsize2+k*ksize2] + u[0+(i-1)*isize2+j*jsize2+k*ksize2]) - tx2 * (u[1+(i+1)*isize2+j*jsize2+k*ksize2] - u[1+(i-1)*isize2+j*jsize2+k*ksize2]); rhs[1+i*isize2+j*jsize2+k*ksize2] = rhs[1+i*isize2+j*jsize2+k*ksize2] + dx2tx1 * (u[1+(i+1)*isize2+j*jsize2+k*ksize2] - 2.0*u[1+i*isize2+j*jsize2+k*ksize2] + u[1+(i-1)*isize2+j*jsize2+k*ksize2]) + xxcon2*con43 * (up1 - 2.0*uijk + um1) - tx2 * (u[1+(i+1)*isize2+j*jsize2+k*ksize2]*up1 - u[1+(i-1)*isize2+j*jsize2+k*ksize2]*um1 + (u[4+(i+1)*isize2+j*jsize2+k*ksize2]- square[(i+1)+j*jsize1+k*ksize1]- u[4+(i-1)*isize2+j*jsize2+k*ksize2]+ square[(i-1)+j*jsize1+k*ksize1])* c2); rhs[2+i*isize2+j*jsize2+k*ksize2] = rhs[2+i*isize2+j*jsize2+k*ksize2] + dx3tx1 * (u[2+(i+1)*isize2+j*jsize2+k*ksize2] - 2.0*u[2+i*isize2+j*jsize2+k*ksize2] + u[2+(i-1)*isize2+j*jsize2+k*ksize2]) + xxcon2 * (vs[(i+1)+j*jsize1+k*ksize1] - 2.0*vs[i+j*jsize1+k*ksize1] + vs[(i-1)+j*jsize1+k*ksize1]) - tx2 * (u[2+(i+1)*isize2+j*jsize2+k*ksize2]*up1 - u[2+(i-1)*isize2+j*jsize2+k*ksize2]*um1); rhs[3+i*isize2+j*jsize2+k*ksize2] = rhs[3+i*isize2+j*jsize2+k*ksize2] + dx4tx1 * (u[3+(i+1)*isize2+j*jsize2+k*ksize2] - 2.0*u[3+i*isize2+j*jsize2+k*ksize2] + u[3+(i-1)*isize2+j*jsize2+k*ksize2]) + xxcon2 * (ws[(i+1)+j*jsize1+k*ksize1] - 2.0*ws[i+j*jsize1+k*ksize1] + ws[(i-1)+j*jsize1+k*ksize1]) - tx2 * (u[3+(i+1)*isize2+j*jsize2+k*ksize2]*up1 - u[3+(i-1)*isize2+j*jsize2+k*ksize2]*um1); rhs[4+i*isize2+j*jsize2+k*ksize2] = rhs[4+i*isize2+j*jsize2+k*ksize2] + dx5tx1 * (u[4+(i+1)*isize2+j*jsize2+k*ksize2] - 2.0*u[4+i*isize2+j*jsize2+k*ksize2] + u[4+(i-1)*isize2+j*jsize2+k*ksize2]) + xxcon3 * (qs[(i+1)+j*jsize1+k*ksize1] - 2.0*qs[i+j*jsize1+k*ksize1] + qs[(i-1)+j*jsize1+k*ksize1]) + xxcon4 * (up1*up1 - 2.0*uijk*uijk + um1*um1) + xxcon5 * (u[4+(i+1)*isize2+j*jsize2+k*ksize2]*rho_i[(i+1)+j*jsize1+k*ksize1] - 2.0*u[4+i*isize2+j*jsize2+k*ksize2]*rho_i[i+j*jsize1+k*ksize1] + u[4+(i-1)*isize2+j*jsize2+k*ksize2]*rho_i[(i-1)+j*jsize1+k*ksize1]) - tx2 * ( (c1*u[4+(i+1)*isize2+j*jsize2+k*ksize2] - c2*square[(i+1)+j*jsize1+k*ksize1])*up1 - (c1*u[4+(i-1)*isize2+j*jsize2+k*ksize2] - c2*square[(i-1)+j*jsize1+k*ksize1])*um1 ); } } //--------------------------------------------------------------------- // add fourth order xi-direction dissipation //--------------------------------------------------------------------- for(j=1;j<=grid_points[1]-2;j++){ i = 1; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2]- dssp * ( 5.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+(i+1)*isize2+j*jsize2+k*ksize2] + u[m+(i+2)*isize2+j*jsize2+k*ksize2]); } i = 2; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * (-4.0*u[m+(i-1)*isize2+j*jsize2+k*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+(i+1)*isize2+j*jsize2+k*ksize2] + u[m+(i+2)*isize2+j*jsize2+k*ksize2]); } for(m=0;m<=4;m++){ for(i=3;i<=grid_points[0]-4;i++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+(i-2)*isize2+j*jsize2+k*ksize2] - 4.0*u[m+(i-1)*isize2+j*jsize2+k*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+(i+1)*isize2+j*jsize2+k*ksize2] + u[m+(i+2)*isize2+j*jsize2+k*ksize2] ); } } i = grid_points[0]-3; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+(i-2)*isize2+j*jsize2+k*ksize2] - 4.0*u[m+(i-1)*isize2+j*jsize2+k*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+(i+1)*isize2+j*jsize2+k*ksize2] ); } i = grid_points[0]-2; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+(i-2)*isize2+j*jsize2+k*ksize2] - 4.*u[m+(i-1)*isize2+j*jsize2+k*ksize2] + 5.*u[m+i*isize2+j*jsize2+k*ksize2] ); } } } if (timeron) timer.stop(t_rhsx); if (timeron) timer.start(t_rhsy); //--------------------------------------------------------------------- // compute eta-direction fluxes //--------------------------------------------------------------------- for(k=1;k<=grid_points[2]-2;k++){ for(j=1;j<=grid_points[1]-2;j++){ for(i=1;i<=grid_points[0]-2;i++){ vijk = vs[i+j*jsize1+k*ksize1]; vp1 = vs[i+(j+1)*jsize1+k*ksize1]; vm1 = vs[i+(j-1)*jsize1+k*ksize1]; rhs[0+i*isize2+j*jsize2+k*ksize2] = rhs[0+i*isize2+j*jsize2+k*ksize2] + dy1ty1 * (u[0+i*isize2+(j+1)*jsize2+k*ksize2] - 2.0*u[0+i*isize2+j*jsize2+k*ksize2] + u[0+i*isize2+(j-1)*jsize2+k*ksize2]) - ty2 * (u[2+i*isize2+(j+1)*jsize2+k*ksize2] - u[2+i*isize2+(j-1)*jsize2+k*ksize2]); rhs[1+i*isize2+j*jsize2+k*ksize2] = rhs[1+i*isize2+j*jsize2+k*ksize2] + dy2ty1 * (u[1+i*isize2+(j+1)*jsize2+k*ksize2] - 2.0*u[1+i*isize2+j*jsize2+k*ksize2] + u[1+i*isize2+(j-1)*jsize2+k*ksize2]) + yycon2 * (us[i+(j+1)*jsize1+k*ksize1] - 2.0*us[i+j*jsize1+k*ksize1] + us[i+(j-1)*jsize1+k*ksize1]) - ty2 * (u[1+i*isize2+(j+1)*jsize2+k*ksize2]*vp1 - u[1+i*isize2+(j-1)*jsize2+k*ksize2]*vm1); rhs[2+i*isize2+j*jsize2+k*ksize2] = rhs[2+i*isize2+j*jsize2+k*ksize2] + dy3ty1 * (u[2+i*isize2+(j+1)*jsize2+k*ksize2] - 2.0*u[2+i*isize2+j*jsize2+k*ksize2] + u[2+i*isize2+(j-1)*jsize2+k*ksize2]) + yycon2*con43 * (vp1 - 2.0*vijk + vm1) - ty2 * (u[2+i*isize2+(j+1)*jsize2+k*ksize2]*vp1 - u[2+i*isize2+(j-1)*jsize2+k*ksize2]*vm1 + (u[4+i*isize2+(j+1)*jsize2+k*ksize2] - square[i+(j+1)*jsize1+k*ksize1] - u[4+i*isize2+(j-1)*jsize2+k*ksize2] + square[i+(j-1)*jsize1+k*ksize1]) *c2); rhs[3+i*isize2+j*jsize2+k*ksize2] = rhs[3+i*isize2+j*jsize2+k*ksize2] + dy4ty1 * (u[3+i*isize2+(j+1)*jsize2+k*ksize2] - 2.0*u[3+i*isize2+j*jsize2+k*ksize2] + u[3+i*isize2+(j-1)*jsize2+k*ksize2]) + yycon2 * (ws[i+(j+1)*jsize1+k*ksize1] - 2.0*ws[i+j*jsize1+k*ksize1] + ws[i+(j-1)*jsize1+k*ksize1]) - ty2 * (u[3+i*isize2+(j+1)*jsize2+k*ksize2]*vp1 - u[3+i*isize2+(j-1)*jsize2+k*ksize2]*vm1); rhs[4+i*isize2+j*jsize2+k*ksize2] = rhs[4+i*isize2+j*jsize2+k*ksize2] + dy5ty1 * (u[4+i*isize2+(j+1)*jsize2+k*ksize2] - 2.0*u[4+i*isize2+j*jsize2+k*ksize2] + u[4+i*isize2+(j-1)*jsize2+k*ksize2]) + yycon3 * (qs[i+(j+1)*jsize1+k*ksize1] - 2.0*qs[i+j*jsize1+k*ksize1] + qs[i+(j-1)*jsize1+k*ksize1]) + yycon4 * (vp1*vp1 - 2.0*vijk*vijk + vm1*vm1) + yycon5 * (u[4+i*isize2+(j+1)*jsize2+k*ksize2]*rho_i[i+(j+1)*jsize1+k*ksize1] - 2.0*u[4+i*isize2+j*jsize2+k*ksize2]*rho_i[i+j*jsize1+k*ksize1] + u[4+i*isize2+(j-1)*jsize2+k*ksize2]*rho_i[i+(j-1)*jsize1+k*ksize1]) - ty2 * ((c1*u[4+i*isize2+(j+1)*jsize2+k*ksize2] - c2*square[i+(j+1)*jsize1+k*ksize1]) * vp1 - (c1*u[4+i*isize2+(j-1)*jsize2+k*ksize2] - c2*square[i+(j-1)*jsize1+k*ksize1]) * vm1); } } //--------------------------------------------------------------------- // add fourth order eta-direction dissipation //--------------------------------------------------------------------- for(i=1;i<=grid_points[0]-2;i++){ j = 1; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2]- dssp * ( 5.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+i*isize2+(j+1)*jsize2+k*ksize2] + u[m+i*isize2+(j+2)*jsize2+k*ksize2]); } j = 2; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * (-4.0*u[m+i*isize2+(j-1)*jsize2+k*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+i*isize2+(j+1)*jsize2+k*ksize2] + u[m+i*isize2+(j+2)*jsize2+k*ksize2]); } } for(j=3;j<=grid_points[1]-4;j++){ for(i=1;i<=grid_points[0]-2;i++){ for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+i*isize2+(j-2)*jsize2+k*ksize2] - 4.0*u[m+i*isize2+(j-1)*jsize2+k*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+i*isize2+(j+1)*jsize2+k*ksize2] + u[m+i*isize2+(j+2)*jsize2+k*ksize2] ); } } } for(i=1;i<=grid_points[0]-2;i++){ j = grid_points[1]-3; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+i*isize2+(j-2)*jsize2+k*ksize2] - 4.0*u[m+i*isize2+(j-1)*jsize2+k*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+i*isize2+(j+1)*jsize2+k*ksize2] ); } j = grid_points[1]-2; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+i*isize2+(j-2)*jsize2+k*ksize2] - 4.*u[m+i*isize2+(j-1)*jsize2+k*ksize2] + 5.*u[m+i*isize2+j*jsize2+k*ksize2] ); } } } if (timeron) timer.stop(t_rhsy); if (timeron) timer.start(t_rhsz); //--------------------------------------------------------------------- // compute zeta-direction fluxes //--------------------------------------------------------------------- for(k=1;k<=grid_points[2]-2;k++){ for(j=1;j<=grid_points[1]-2;j++){ for(i=1;i<=grid_points[0]-2;i++){ wijk = ws[i+j*jsize1+k*ksize1]; wp1 = ws[i+j*jsize1+(k+1)*ksize1]; wm1 = ws[i+j*jsize1+(k-1)*ksize1]; rhs[0+i*isize2+j*jsize2+k*ksize2] = rhs[0+i*isize2+j*jsize2+k*ksize2] + dz1tz1 * (u[0+i*isize2+j*jsize2+(k+1)*ksize2] - 2.0*u[0+i*isize2+j*jsize2+k*ksize2] + u[0+i*isize2+j*jsize2+(k-1)*ksize2]) - tz2 * (u[3+i*isize2+j*jsize2+(k+1)*ksize2] - u[3+i*isize2+j*jsize2+(k-1)*ksize2]); rhs[1+i*isize2+j*jsize2+k*ksize2] = rhs[1+i*isize2+j*jsize2+k*ksize2] + dz2tz1 * (u[1+i*isize2+j*jsize2+(k+1)*ksize2] - 2.0*u[1+i*isize2+j*jsize2+k*ksize2] + u[1+i*isize2+j*jsize2+(k-1)*ksize2]) + zzcon2 * (us[i+j*jsize1+(k+1)*ksize1] - 2.0*us[i+j*jsize1+k*ksize1] + us[i+j*jsize1+(k-1)*ksize1]) - tz2 * (u[1+i*isize2+j*jsize2+(k+1)*ksize2]*wp1 - u[1+i*isize2+j*jsize2+(k-1)*ksize2]*wm1); rhs[2+i*isize2+j*jsize2+k*ksize2] = rhs[2+i*isize2+j*jsize2+k*ksize2] + dz3tz1 * (u[2+i*isize2+j*jsize2+(k+1)*ksize2] - 2.0*u[2+i*isize2+j*jsize2+k*ksize2] + u[2+i*isize2+j*jsize2+(k-1)*ksize2]) + zzcon2 * (vs[i+j*jsize1+(k+1)*ksize1] - 2.0*vs[i+j*jsize1+k*ksize1] + vs[i+j*jsize1+(k-1)*ksize1]) - tz2 * (u[2+i*isize2+j*jsize2+(k+1)*ksize2]*wp1 - u[2+i*isize2+j*jsize2+(k-1)*ksize2]*wm1); rhs[3+i*isize2+j*jsize2+k*ksize2] = rhs[3+i*isize2+j*jsize2+k*ksize2] + dz4tz1 * (u[3+i*isize2+j*jsize2+(k+1)*ksize2] - 2.0*u[3+i*isize2+j*jsize2+k*ksize2] + u[3+i*isize2+j*jsize2+(k-1)*ksize2]) + zzcon2*con43 * (wp1 - 2.0*wijk + wm1) - tz2 * (u[3+i*isize2+j*jsize2+(k+1)*ksize2]*wp1 - u[3+i*isize2+j*jsize2+(k-1)*ksize2]*wm1 + (u[4+i*isize2+j*jsize2+(k+1)*ksize2] - square[i+j*jsize1+(k+1)*ksize1] - u[4+i*isize2+j*jsize2+(k-1)*ksize2] + square[i+j*jsize1+(k-1)*ksize1]) *c2); rhs[4+i*isize2+j*jsize2+k*ksize2] = rhs[4+i*isize2+j*jsize2+k*ksize2] + dz5tz1 * (u[4+i*isize2+j*jsize2+(k+1)*ksize2] - 2.0*u[4+i*isize2+j*jsize2+k*ksize2] + u[4+i*isize2+j*jsize2+(k-1)*ksize2]) + zzcon3 * (qs[i+j*jsize1+(k+1)*ksize1] - 2.0*qs[i+j*jsize1+k*ksize1] + qs[i+j*jsize1+(k-1)*ksize1]) + zzcon4 * (wp1*wp1 - 2.0*wijk*wijk + wm1*wm1) + zzcon5 * (u[4+i*isize2+j*jsize2+(k+1)*ksize2]*rho_i[i+j*jsize1+(k+1)*ksize1] - 2.0*u[4+i*isize2+j*jsize2+k*ksize2]*rho_i[i+j*jsize1+k*ksize1] + u[4+i*isize2+j*jsize2+(k-1)*ksize2]*rho_i[i+j*jsize1+(k-1)*ksize1]) - tz2 * ( (c1*u[4+i*isize2+j*jsize2+(k+1)*ksize2] - c2*square[i+j*jsize1+(k+1)*ksize1])*wp1 - (c1*u[4+i*isize2+j*jsize2+(k-1)*ksize2] - c2*square[i+j*jsize1+(k-1)*ksize1])*wm1); } } } //--------------------------------------------------------------------- // add fourth order zeta-direction dissipation //--------------------------------------------------------------------- for(j=1;j<=grid_points[1]-2;j++){ for(i=1;i<=grid_points[0]-2;i++){ k = 1; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2]- dssp * ( 5.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+i*isize2+j*jsize2+(k+1)*ksize2] + u[m+i*isize2+j*jsize2+(k+2)*ksize2]); } k = 2; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * (-4.0*u[m+i*isize2+j*jsize2+(k-1)*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+i*isize2+j*jsize2+(k+1)*ksize2] + u[m+i*isize2+j*jsize2+(k+2)*ksize2]); } } } for(k=3;k<=grid_points[2]-4;k++){ for(j=1;j<=grid_points[1]-2;j++){ for(i=1;i<=grid_points[0]-2;i++){ for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+i*isize2+j*jsize2+(k-2)*ksize2] - 4.0*u[m+i*isize2+j*jsize2+(k-1)*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+i*isize2+j*jsize2+(k+1)*ksize2] + u[m+i*isize2+j*jsize2+(k+2)*ksize2] ); } } } } for(j=1;j<=grid_points[1]-2;j++){ for(i=1;i<=grid_points[0]-2;i++){ k = grid_points[2]-3; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+i*isize2+j*jsize2+(k-2)*ksize2] - 4.0*u[m+i*isize2+j*jsize2+(k-1)*ksize2] + 6.0*u[m+i*isize2+j*jsize2+k*ksize2] - 4.0*u[m+i*isize2+j*jsize2+(k+1)*ksize2] ); } k = grid_points[2]-2; for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - dssp * ( u[m+i*isize2+j*jsize2+(k-2)*ksize2] - 4.*u[m+i*isize2+j*jsize2+(k-1)*ksize2] + 5.0*u[m+i*isize2+j*jsize2+k*ksize2] ); } } } if (timeron) timer.stop(t_rhsz); for(k=1;k<=grid_points[2]-2;k++){ for(j=1;j<=grid_points[1]-2;j++){ for(i=1;i<=grid_points[0]-2;i++){ for(m=0;m<=4;m++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] * dt; } } } } if (timeron) timer.stop(t_rhs); } public void print_lhs(){ double count1=0,count2=0,count3=0; for(int i=0;i<5;i++){ for(int j=0;j<5;j++){ for(int m=0;m=0;j--){ for(m=0;m<=BLOCK_SIZE-1;m++){ for(n=0;n<=BLOCK_SIZE-1;n++){ rhs[m+i*isize2+j*jsize2+k*ksize2] = rhs[m+i*isize2+j*jsize2+k*ksize2] - lhs[m+n*isize4+cc*jsize4+j*ksize4]*rhs[n+i*isize2+(j+1)*jsize2+k*ksize2]; } } } } } if (timeron) timer.stop(t_ysolve); } public void z_solve(){ int i, j, k, m, n, ksize; if (timeron) timer.start(t_zsolve); //--------------------------------------------------------------------- // This function computes the left hand side for the three z-factors //--------------------------------------------------------------------- ksize = grid_points[2]-1; //--------------------------------------------------------------------- // Compute the indices for storing the block-diagonal matrix; // determine c (labeled f) and s jacobians //--------------------------------------------------------------------- for(j=1;j<=grid_points[1]-2;j++){ for(i=1;i<=grid_points[0]-2;i++){ for(k=0;k<=ksize;k++){ tmp1 = 1.0 / u[0+i*isize2+j*jsize2+k*ksize2]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; fjac[0+0*isize4+k*jsize4] = 0.0; fjac[0+1*isize4+k*jsize4] = 0.0; fjac[0+2*isize4+k*jsize4] = 0.0; fjac[0+3*isize4+k*jsize4] = 1.0; fjac[0+4*isize4+k*jsize4] = 0.0; fjac[1+0*isize4+k*jsize4] = - ( u[1+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] ) * tmp2 ; fjac[1+1*isize4+k*jsize4] = u[3+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[1+2*isize4+k*jsize4] = 0.0; fjac[1+3*isize4+k*jsize4] = u[1+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[1+4*isize4+k*jsize4] = 0.0; fjac[2+0*isize4+k*jsize4] = - ( u[2+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] ) * tmp2 ; fjac[2+1*isize4+k*jsize4] = 0.0; fjac[2+2*isize4+k*jsize4] = u[3+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[2+3*isize4+k*jsize4] = u[2+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[2+4*isize4+k*jsize4] = 0.0; fjac[3+0*isize4+k*jsize4] = - (u[3+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] * tmp2 ) + c2 * qs[i+j*jsize1+k*ksize1]; fjac[3+1*isize4+k*jsize4] = - c2 * u[1+i*isize2+j*jsize2+k*ksize2] * tmp1 ; fjac[3+2*isize4+k*jsize4] = - c2 * u[2+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[3+3*isize4+k*jsize4] = ( 2.0 - c2 ) * u[3+i*isize2+j*jsize2+k*ksize2] * tmp1; fjac[3+4*isize4+k*jsize4] = c2; fjac[4+0*isize4+k*jsize4] = ( c2 * 2.0 * square[i+j*jsize1+k*ksize1] - c1 * u[4+i*isize2+j*jsize2+k*ksize2] ) * u[3+i*isize2+j*jsize2+k*ksize2] * tmp2; fjac[4+1*isize4+k*jsize4] = - c2 * ( u[1+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] ) * tmp2 ; fjac[4+2*isize4+k*jsize4] = - c2 * ( u[2+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] ) * tmp2; fjac[4+3*isize4+k*jsize4] = c1 * ( u[4+i*isize2+j*jsize2+k*ksize2] * tmp1 ) - c2 * ( qs[i+j*jsize1+k*ksize1] + u[3+i*isize2+j*jsize2+k*ksize2]*u[3+i*isize2+j*jsize2+k*ksize2] * tmp2 ); fjac[4+4*isize4+k*jsize4] = c1 * u[3+i*isize2+j*jsize2+k*ksize2] * tmp1; njac[0+0*isize4+k*jsize4] = 0.0; njac[0+1*isize4+k*jsize4] = 0.0; njac[0+2*isize4+k*jsize4] = 0.0; njac[0+3*isize4+k*jsize4] = 0.0; njac[0+4*isize4+k*jsize4] = 0.0; njac[1+0*isize4+k*jsize4] = - c3c4 * tmp2 * u[1+i*isize2+j*jsize2+k*ksize2]; njac[1+1*isize4+k*jsize4] = c3c4 * tmp1; njac[1+2*isize4+k*jsize4] = 0.0; njac[1+3*isize4+k*jsize4] = 0.0; njac[1+4*isize4+k*jsize4] = 0.0; njac[2+0*isize4+k*jsize4] = - c3c4 * tmp2 * u[2+i*isize2+j*jsize2+k*ksize2]; njac[2+1*isize4+k*jsize4] = 0.0; njac[2+2*isize4+k*jsize4] = c3c4 * tmp1; njac[2+3*isize4+k*jsize4] = 0.0; njac[2+4*isize4+k*jsize4] = 0.0; njac[3+0*isize4+k*jsize4] = - con43 * c3c4 * tmp2 * u[3+i*isize2+j*jsize2+k*ksize2]; njac[3+1*isize4+k*jsize4] = 0.0; njac[3+2*isize4+k*jsize4] = 0.0; njac[3+3*isize4+k*jsize4] = con43 * c3 * c4 * tmp1; njac[3+4*isize4+k*jsize4] = 0.0; njac[4+0*isize4+k*jsize4] = - ( c3c4 - c1345 ) * tmp3 * (Math.pow(u[1+i*isize2+j*jsize2+k*ksize2],2)) - ( c3c4 - c1345 ) * tmp3 * (Math.pow(u[2+i*isize2+j*jsize2+k*ksize2],2)) - ( con43 * c3c4 - c1345 ) * tmp3 * (Math.pow(u[3+i*isize2+j*jsize2+k*ksize2],2)) - c1345 * tmp2 * u[4+i*isize2+j*jsize2+k*ksize2]; njac[4+1*isize4+k*jsize4] = ( c3c4 - c1345 ) * tmp2 * u[1+i*isize2+j*jsize2+k*ksize2]; njac[4+2*isize4+k*jsize4] = ( c3c4 - c1345 ) * tmp2 * u[2+i*isize2+j*jsize2+k*ksize2]; njac[4+3*isize4+k*jsize4] = ( con43 * c3c4 - c1345 ) * tmp2 * u[3+i*isize2+j*jsize2+k*ksize2]; njac[4+4*isize4+k*jsize4] = ( c1345 )* tmp1; } //--------------------------------------------------------------------- // now jacobians set, so form left hand side in z direction //--------------------------------------------------------------------- lhsinit(lhs, ksize); for(k=1;k<=ksize-1;k++){ tmp1 = dt * tz1; tmp2 = dt * tz2; lhs[0+0*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[0+0*isize4+(k-1)*jsize4] - tmp1 * njac[0+0*isize4+(k-1)*jsize4] - tmp1 * dz1 ; lhs[0+1*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[0+1*isize4+(k-1)*jsize4] - tmp1 * njac[0+1*isize4+(k-1)*jsize4]; lhs[0+2*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[0+2*isize4+(k-1)*jsize4] - tmp1 * njac[0+2*isize4+(k-1)*jsize4]; lhs[0+3*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[0+3*isize4+(k-1)*jsize4] - tmp1 * njac[0+3*isize4+(k-1)*jsize4]; lhs[0+4*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[0+4*isize4+(k-1)*jsize4] - tmp1 * njac[0+4*isize4+(k-1)*jsize4]; lhs[1+0*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[1+0*isize4+(k-1)*jsize4] - tmp1 * njac[1+0*isize4+(k-1)*jsize4]; lhs[1+1*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[1+1*isize4+(k-1)*jsize4] - tmp1 * njac[1+1*isize4+(k-1)*jsize4] - tmp1 * dz2; lhs[1+2*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[1+2*isize4+(k-1)*jsize4] - tmp1 * njac[1+2*isize4+(k-1)*jsize4]; lhs[1+3*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[1+3*isize4+(k-1)*jsize4] - tmp1 * njac[1+3*isize4+(k-1)*jsize4]; lhs[1+4*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[1+4*isize4+(k-1)*jsize4] - tmp1 * njac[1+4*isize4+(k-1)*jsize4]; lhs[2+0*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[2+0*isize4+(k-1)*jsize4] - tmp1 * njac[2+0*isize4+(k-1)*jsize4]; lhs[2+1*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[2+1*isize4+(k-1)*jsize4] - tmp1 * njac[2+1*isize4+(k-1)*jsize4]; lhs[2+2*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[2+2*isize4+(k-1)*jsize4] - tmp1 * njac[2+2*isize4+(k-1)*jsize4] - tmp1 * dz3; lhs[2+3*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[2+3*isize4+(k-1)*jsize4] - tmp1 * njac[2+3*isize4+(k-1)*jsize4]; lhs[2+4*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[2+4*isize4+(k-1)*jsize4] - tmp1 * njac[2+4*isize4+(k-1)*jsize4]; lhs[3+0*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[3+0*isize4+(k-1)*jsize4] - tmp1 * njac[3+0*isize4+(k-1)*jsize4]; lhs[3+1*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[3+1*isize4+(k-1)*jsize4] - tmp1 * njac[3+1*isize4+(k-1)*jsize4]; lhs[3+2*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[3+2*isize4+(k-1)*jsize4] - tmp1 * njac[3+2*isize4+(k-1)*jsize4]; lhs[3+3*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[3+3*isize4+(k-1)*jsize4] - tmp1 * njac[3+3*isize4+(k-1)*jsize4] - tmp1 * dz4; lhs[3+4*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[3+4*isize4+(k-1)*jsize4] - tmp1 * njac[3+4*isize4+(k-1)*jsize4]; lhs[4+0*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[4+0*isize4+(k-1)*jsize4] - tmp1 * njac[4+0*isize4+(k-1)*jsize4]; lhs[4+1*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[4+1*isize4+(k-1)*jsize4] - tmp1 * njac[4+1*isize4+(k-1)*jsize4]; lhs[4+2*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[4+2*isize4+(k-1)*jsize4] - tmp1 * njac[4+2*isize4+(k-1)*jsize4]; lhs[4+3*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[4+3*isize4+(k-1)*jsize4] - tmp1 * njac[4+3*isize4+(k-1)*jsize4]; lhs[4+4*isize4+aa*jsize4+k*ksize4] = - tmp2 * fjac[4+4*isize4+(k-1)*jsize4] - tmp1 * njac[4+4*isize4+(k-1)*jsize4] - tmp1 * dz5; lhs[0+0*isize4+bb*jsize4+k*ksize4] = 1.0 + tmp1 * 2.0 * njac[0+0*isize4+k*jsize4] + tmp1 * 2.0 * dz1; lhs[0+1*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[0+1*isize4+k*jsize4]; lhs[0+2*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[0+2*isize4+k*jsize4]; lhs[0+3*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[0+3*isize4+k*jsize4]; lhs[0+4*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[0+4*isize4+k*jsize4]; lhs[1+0*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[1+0*isize4+k*jsize4]; lhs[1+1*isize4+bb*jsize4+k*ksize4] = 1.0 + tmp1 * 2.0 * njac[1+1*isize4+k*jsize4] + tmp1 * 2.0 * dz2; lhs[1+2*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[1+2*isize4+k*jsize4]; lhs[1+3*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[1+3*isize4+k*jsize4]; lhs[1+4*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[1+4*isize4+k*jsize4]; lhs[2+0*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[2+0*isize4+k*jsize4]; lhs[2+1*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[2+1*isize4+k*jsize4]; lhs[2+2*isize4+bb*jsize4+k*ksize4] = 1.0 + tmp1 * 2.0 * njac[2+2*isize4+k*jsize4] + tmp1 * 2.0 * dz3; lhs[2+3*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[2+3*isize4+k*jsize4]; lhs[2+4*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[2+4*isize4+k*jsize4]; lhs[3+0*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[3+0*isize4+k*jsize4]; lhs[3+1*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[3+1*isize4+k*jsize4]; lhs[3+2*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[3+2*isize4+k*jsize4]; lhs[3+3*isize4+bb*jsize4+k*ksize4] = 1.0 + tmp1 * 2.0 * njac[3+3*isize4+k*jsize4] + tmp1 * 2.0 * dz4; lhs[3+4*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[3+4*isize4+k*jsize4]; lhs[4+0*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[4+0*isize4+k*jsize4]; lhs[4+1*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[4+1*isize4+k*jsize4]; lhs[4+2*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[4+2*isize4+k*jsize4]; lhs[4+3*isize4+bb*jsize4+k*ksize4] = tmp1 * 2.0 * njac[4+3*isize4+k*jsize4]; lhs[4+4*isize4+bb*jsize4+k*ksize4] = 1.0 + tmp1 * 2.0 * njac[4+4*isize4+k*jsize4] + tmp1 * 2.0 * dz5; lhs[0+0*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[0+0*isize4+(k+1)*jsize4] - tmp1 * njac[0+0*isize4+(k+1)*jsize4] - tmp1 * dz1; lhs[0+1*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[0+1*isize4+(k+1)*jsize4] - tmp1 * njac[0+1*isize4+(k+1)*jsize4]; lhs[0+2*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[0+2*isize4+(k+1)*jsize4] - tmp1 * njac[0+2*isize4+(k+1)*jsize4]; lhs[0+3*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[0+3*isize4+(k+1)*jsize4] - tmp1 * njac[0+3*isize4+(k+1)*jsize4]; lhs[0+4*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[0+4*isize4+(k+1)*jsize4] - tmp1 * njac[0+4*isize4+(k+1)*jsize4]; lhs[1+0*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[1+0*isize4+(k+1)*jsize4] - tmp1 * njac[1+0*isize4+(k+1)*jsize4]; lhs[1+1*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[1+1*isize4+(k+1)*jsize4] - tmp1 * njac[1+1*isize4+(k+1)*jsize4] - tmp1 * dz2; lhs[1+2*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[1+2*isize4+(k+1)*jsize4] - tmp1 * njac[1+2*isize4+(k+1)*jsize4]; lhs[1+3*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[1+3*isize4+(k+1)*jsize4] - tmp1 * njac[1+3*isize4+(k+1)*jsize4]; lhs[1+4*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[1+4*isize4+(k+1)*jsize4] - tmp1 * njac[1+4*isize4+(k+1)*jsize4]; lhs[2+0*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[2+0*isize4+(k+1)*jsize4] - tmp1 * njac[2+0*isize4+(k+1)*jsize4]; lhs[2+1*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[2+1*isize4+(k+1)*jsize4] - tmp1 * njac[2+1*isize4+(k+1)*jsize4]; lhs[2+2*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[2+2*isize4+(k+1)*jsize4] - tmp1 * njac[2+2*isize4+(k+1)*jsize4] - tmp1 * dz3; lhs[2+3*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[2+3*isize4+(k+1)*jsize4] - tmp1 * njac[2+3*isize4+(k+1)*jsize4]; lhs[2+4*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[2+4*isize4+(k+1)*jsize4] - tmp1 * njac[2+4*isize4+(k+1)*jsize4]; lhs[3+0*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[3+0*isize4+(k+1)*jsize4] - tmp1 * njac[3+0*isize4+(k+1)*jsize4]; lhs[3+1*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[3+1*isize4+(k+1)*jsize4] - tmp1 * njac[3+1*isize4+(k+1)*jsize4]; lhs[3+2*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[3+2*isize4+(k+1)*jsize4] - tmp1 * njac[3+2*isize4+(k+1)*jsize4]; lhs[3+3*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[3+3*isize4+(k+1)*jsize4] - tmp1 * njac[3+3*isize4+(k+1)*jsize4] - tmp1 * dz4; lhs[3+4*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[3+4*isize4+(k+1)*jsize4] - tmp1 * njac[3+4*isize4+(k+1)*jsize4]; lhs[4+0*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[4+0*isize4+(k+1)*jsize4] - tmp1 * njac[4+0*isize4+(k+1)*jsize4]; lhs[4+1*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[4+1*isize4+(k+1)*jsize4] - tmp1 * njac[4+1*isize4+(k+1)*jsize4]; lhs[4+2*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[4+2*isize4+(k+1)*jsize4] - tmp1 * njac[4+2*isize4+(k+1)*jsize4]; lhs[4+3*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[4+3*isize4+(k+1)*jsize4] - tmp1 * njac[4+3*isize4+(k+1)*jsize4]; lhs[4+4*isize4+cc*jsize4+k*ksize4] = tmp2 * fjac[4+4*isize4+(k+1)*jsize4] - tmp1 * njac[4+4*isize4+(k+1)*jsize4] - tmp1 * dz5; } //--------------------------------------------------------------------- // performs guaussian elimination on this cell. // // assumes that unpacking routines for non-first cells // preload C' and rhs' from previous cell. // // assumed send happens outside this routine, but that // c'(KMAX) and rhs'(KMAX) will be sent to next cell. //--------------------------------------------------------------------- //--------------------------------------------------------------------- // outer most do loops - sweeping in i direction //--------------------------------------------------------------------- //--------------------------------------------------------------------- // multiply c(i,j,0) by b_inverse and copy back to c // multiply rhs(0) by b_inverse(0) and copy to rhs //--------------------------------------------------------------------- binvcrhs( lhs,0+0*isize4+bb*jsize4+0*ksize4, lhs,0+0*isize4+cc*jsize4+0*ksize4, rhs,0+i*isize2+j*jsize2+0*ksize2 ); //--------------------------------------------------------------------- // begin inner most do loop // do all the elements of the cell unless last //--------------------------------------------------------------------- for(k=1;k<=ksize-1;k++){ //--------------------------------------------------------------------- // subtract A*lhs_vector(k-1) from lhs_vector(k) // // rhs(k) = rhs(k) - A*rhs(k-1) //--------------------------------------------------------------------- matvec_sub(lhs,0+0*isize4+aa*jsize4+k*ksize4, rhs,0+i*isize2+j*jsize2+(k-1)*ksize2, rhs,0+i*isize2+j*jsize2+k*ksize2); //--------------------------------------------------------------------- // B(k) = B(k) - C(k-1)*A(k) // call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k) //--------------------------------------------------------------------- matmul_sub(lhs,0+0*isize4+aa*jsize4+k*ksize4, lhs,0+0*isize4+cc*jsize4+(k-1)*ksize4, lhs,0+0*isize4+bb*jsize4+k*ksize4); //--------------------------------------------------------------------- // multiply c(i,j,k) by b_inverse and copy back to c // multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs //--------------------------------------------------------------------- binvcrhs( lhs,0+0*isize4+bb*jsize4+k*ksize4, lhs,0+0*isize4+cc*jsize4+k*ksize4, rhs,0+i*isize2+j*jsize2+k*ksize2 ); } //--------------------------------------------------------------------- // Now finish up special cases for last cell //--------------------------------------------------------------------- //--------------------------------------------------------------------- // rhs(ksize) = rhs(ksize) - A*rhs(ksize-1) //--------------------------------------------------------------------- matvec_sub(lhs,0+0*isize4+aa*jsize4+ksize*ksize4, rhs,0+i*isize2+j*jsize2+(ksize-1)*ksize2, rhs,0+i*isize2+j*jsize2+ksize*ksize2); //--------------------------------------------------------------------- // B(ksize) = B(ksize) - C(ksize-1)*A(ksize) // call matmul_sub(aa,i,j,ksize,c, // $ cc,i,j,ksize-1,c,bb,i,j,ksize) //--------------------------------------------------------------------- matmul_sub(lhs,0+0*isize4+aa*jsize4+ksize*ksize4, lhs,0+0*isize4+cc*jsize4+(ksize-1)*ksize4, lhs,0+0*isize4+bb*jsize4+ksize*ksize4); //--------------------------------------------------------------------- // multiply rhs(ksize) by b_inverse(ksize) and copy to rhs //--------------------------------------------------------------------- binvrhs( lhs,0+0*isize4+bb*jsize4+ksize*ksize4, rhs,0+i*isize2+j*jsize2+ksize*ksize2 ); //--------------------------------------------------------------------- // back solve: if last cell, then generate U(ksize)=rhs(ksize) // else assume U(ksize) is loaded in un pack backsub_info // so just use it // after call u(kstart) will be sent to next cell //--------------------------------------------------------------------- for(k=ksize-1;k>=0;k--){ for(m=0;m<=BLOCK_SIZE-1;m++){ for(n=0;n<=BLOCK_SIZE-1;n++){ rhs[m+i*isize2+j*jsize2+k*ksize2] += -lhs[m+n*isize4+cc*jsize4+k*ksize4] *rhs[n+i*isize2+j*jsize2+(k+1)*ksize2]; } } } } } if(timeron) timer.stop(t_zsolve); } public double checkSum(double arr[]){ double csum=0.0; for(int k=0;k<=grid_points[2]-1;k++){ for(int j=0;j<=grid_points[1]-1;j++){ for(int i=0;i<=grid_points[0]-1;i++){ for(int m=0;m<=4;m++){ int offset=m+i*isize2+j*jsize2+k*ksize2; csum+=(arr[offset]*arr[offset])/ (double)(grid_points[2]*grid_points[1]*grid_points[0]*5); } } } } return csum; } public double getTime(){return timer.readTimer(1);} public void finalize() throws Throwable{ results=null; fjac=null; njac=null; lhs=null; super.finalize(); } }