benign_test_resilience_c.jsonl

{"code": "#include \"npbparams.h\"\n#include <stdlib.h>\n#include <stdio.h>\n\n#define USE_BUCKETS\n\n\n/******************/\n/* default values */\n/******************/\n#ifndef CLASS\n#define CLASS 'S'\n#endif\n\n\n/*************/\n/*  CLASS S  */\n/*************/\n#if CLASS == 'S'\n#define  TOTAL_KEYS_LOG_2    16\n#define  MAX_KEY_LOG_2       11\n#define  NUM_BUCKETS_LOG_2   9\n#endif\n\n\n/*************/\n/*  CLASS W  */\n/*************/\n#if CLASS == 'W'\n#define  TOTAL_KEYS_LOG_2    20\n#define  MAX_KEY_LOG_2       16\n#define  NUM_BUCKETS_LOG_2   10\n#endif\n\n/*************/\n/*  CLASS A  */\n/*************/\n#if CLASS == 'A'\n#define  TOTAL_KEYS_LOG_2    23\n#define  MAX_KEY_LOG_2       19\n#define  NUM_BUCKETS_LOG_2   10\n#endif\n\n\n/*************/\n/*  CLASS B  */\n/*************/\n#if CLASS == 'B'\n#define  TOTAL_KEYS_LOG_2    25\n#define  MAX_KEY_LOG_2       21\n#define  NUM_BUCKETS_LOG_2   10\n#endif\n\n\n/*************/\n/*  CLASS C  */\n/*************/\n#if CLASS == 'C'\n#define  TOTAL_KEYS_LOG_2    27\n#define  MAX_KEY_LOG_2       23\n#define  NUM_BUCKETS_LOG_2   10\n#endif\n\n\n/*************/\n/*  CLASS D  */\n/*************/\n#if CLASS == 'D'\n#define  TOTAL_KEYS_LOG_2    31\n#define  MAX_KEY_LOG_2       27\n#define  NUM_BUCKETS_LOG_2   10\n#endif\n\n\n#if CLASS == 'D'\n#define  TOTAL_KEYS          (1L << TOTAL_KEYS_LOG_2)\n#else\n#define  TOTAL_KEYS          (1 << TOTAL_KEYS_LOG_2)\n#endif\n#define  MAX_KEY             (1 << MAX_KEY_LOG_2)\n#define  NUM_BUCKETS         (1 << NUM_BUCKETS_LOG_2)\n#define  NUM_KEYS            TOTAL_KEYS\n#define  SIZE_OF_BUFFERS     NUM_KEYS  \n                                           \n\n#define  MAX_ITERATIONS      10\n#define  TEST_ARRAY_SIZE     5\n\n\n/*************************************/\n/* Typedef: if necessary, change the */\n/* size of int here by changing the  */\n/* int type to, say, long            */\n/*************************************/\n#if CLASS == 'D'\ntypedef  long INT_TYPE;\n#else\ntypedef  int  INT_TYPE;\n#endif\n\n\n/********************/\n/* Some global info */\n/********************/\nINT_TYPE *key_buff_ptr_global;         /* used by full_verify to get */\n                                       /* copies of rank info        */\n\nint      passed_verification;\n                                 \n\n/************************************/\n/* These are the three main arrays. */\n/* See SIZE_OF_BUFFERS def above    */\n/************************************/\nINT_TYPE key_array[SIZE_OF_BUFFERS],    \n         key_buff1[MAX_KEY],    \n         key_buff2[SIZE_OF_BUFFERS],\n         partial_verify_vals[TEST_ARRAY_SIZE];\n\n#ifdef USE_BUCKETS\nINT_TYPE bucket_size[NUM_BUCKETS],                    \n         bucket_ptrs[NUM_BUCKETS];\n#endif\n\n\n/**********************/\n/* Partial verif info */\n/**********************/\nINT_TYPE test_index_array[TEST_ARRAY_SIZE],\n         test_rank_array[TEST_ARRAY_SIZE],\n\n         S_test_index_array[TEST_ARRAY_SIZE] = \n                             {48427,17148,23627,62548,4431},\n         S_test_rank_array[TEST_ARRAY_SIZE] = \n                             {0,18,346,64917,65463},\n\n         W_test_index_array[TEST_ARRAY_SIZE] = \n                             {357773,934767,875723,898999,404505},\n         W_test_rank_array[TEST_ARRAY_SIZE] = \n                             {1249,11698,1039987,1043896,1048018},\n\n         A_test_index_array[TEST_ARRAY_SIZE] = \n                             {2112377,662041,5336171,3642833,4250760},\n         A_test_rank_array[TEST_ARRAY_SIZE] = \n                             {104,17523,123928,8288932,8388264},\n\n         B_test_index_array[TEST_ARRAY_SIZE] = \n                             {41869,812306,5102857,18232239,26860214},\n         B_test_rank_array[TEST_ARRAY_SIZE] = \n                             {33422937,10244,59149,33135281,99}, \n\n         C_test_index_array[TEST_ARRAY_SIZE] = \n                             {44172927,72999161,74326391,129606274,21736814},\n         C_test_rank_array[TEST_ARRAY_SIZE] = \n                             {61147,882988,266290,133997595,133525895},\n\n         D_test_index_array[TEST_ARRAY_SIZE] = \n                             {1317351170,995930646,1157283250,1503301535,1453734525},\n         D_test_rank_array[TEST_ARRAY_SIZE] = \n                             {1,36538729,1978098519,2145192618,2147425337};\n\n\n\n/***********************/\n/* function prototypes */\n/***********************/\ndouble\trandlc( double *X, double *A );\n\nvoid full_verify( void );\n\nvoid c_print_results( char   *name,\n                      char   class,\n                      int    n1, \n                      int    n2,\n                      int    n3,\n                      int    niter,\n                      double t,\n                      double mops,\n\t\t      char   *optype,\n                      int    passed_verification,\n                      char   *npbversion,\n                      char   *compiletime,\n                      char   *cc,\n                      char   *clink,\n                      char   *c_lib,\n                      char   *c_inc,\n                      char   *cflags,\n                      char   *clinkflags );\n\n\nvoid    timer_clear( int n );\nvoid    timer_start( int n );\nvoid    timer_stop( int n );\ndouble  timer_read( int n );\n\n\n\ndouble\trandlc( double *X, double *A )\n{\n      static int        KS=0;\n      static double\tR23, R46, T23, T46;\n      double\t\tT1, T2, T3, T4;\n      double\t\tA1;\n      double\t\tA2;\n      double\t\tX1;\n      double\t\tX2;\n      double\t\tZ;\n      int     \t\ti, j;\n\n      if (KS == 0) \n      {\n        R23 = 1.0;\n        R46 = 1.0;\n        T23 = 1.0;\n        T46 = 1.0;\n    \n        for (i=1; i<=23; i++)\n        {\n          R23 = 0.50 * R23;\n          T23 = 2.0 * T23;\n        }\n        for (i=1; i<=46; i++)\n        {\n          R46 = 0.50 * R46;\n          T46 = 2.0 * T46;\n        }\n        KS = 1;\n      }\n\n/*  Break A into two parts such that A = 2^23 * A1 + A2 and set X = N.  */\n\n      T1 = R23 * *A;\n      j  = T1;\n      A1 = j;\n      A2 = *A - T23 * A1;\n\n/*  Break X into two parts such that X = 2^23 * X1 + X2, compute\n    Z = A1 * X2 + A2 * X1  (mod 2^23), and then\n    X = 2^23 * Z + A2 * X2  (mod 2^46).                            */\n\n      T1 = R23 * *X;\n      j  = T1;\n      X1 = j;\n      X2 = *X - T23 * X1;\n      T1 = A1 * X2 + A2 * X1;\n      \n      j  = R23 * T1;\n      T2 = j;\n      Z = T1 - T23 * T2;\n      T3 = T23 * Z + A2 * X2;\n      j  = R46 * T3;\n      T4 = j;\n      *X = T3 - T46 * T4;\n      return(R46 * *X);\n} \n\n\n\n\n/*****************************************************************/\n/*************      C  R  E  A  T  E  _  S  E  Q      ************/\n/*****************************************************************/\n\nvoid\tcreate_seq( double seed, double a )\n{\n\tdouble x;\n\tint    i, k;\n\n        k = MAX_KEY/4;\n\n\tfor (i=0; i<NUM_KEYS; i++)\n\t{\n\t    x = randlc(&seed, &a);\n\t    x += randlc(&seed, &a);\n    \t    x += randlc(&seed, &a);\n\t    x += randlc(&seed, &a);  \n\n            key_array[i] = k*x;\n\t}\n}\n\n\n\nvoid full_verify( void )\n{\n    INT_TYPE    i, j;\n\n\n    \n/*  Now, finally, sort the keys:  */\n\n#ifdef USE_BUCKETS\n\n    /* key_buff2[] already has the proper information, so do nothing */\n\n#else\n\n/*  Copy keys into work array; keys in key_array will be reassigned. */\n    for( i=0; i<NUM_KEYS; i++ )\n        key_buff2[i] = key_array[i];\n\n#endif\n\n    for( i=0; i<NUM_KEYS; i++ )\n        key_array[--key_buff_ptr_global[key_buff2[i]]] = key_buff2[i];\n\n\n/*  Confirm keys correctly sorted: count incorrectly sorted keys, if any */\n\n    j = 0;\n    for( i=1; i<NUM_KEYS; i++ )\n        if( key_array[i-1] > key_array[i] )\n            j++;\n\n\n    if( j != 0 )\n    {\n        printf( \"Full_verify: number of keys out of sort: %ld\\n\",\n                (long)j );\n    }\n    else\n        passed_verification++;\n           \n\n}\n\n\n\n\n/*****************************************************************/\n/*************             R  A  N  K             ****************/\n/*****************************************************************/\n\n\nvoid rank( int iteration )\n{\n\n    INT_TYPE    i, k;\n\n    INT_TYPE    *key_buff_ptr, *key_buff_ptr2;\n\n#ifdef USE_BUCKETS\n    int shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;\n    INT_TYPE    key;\n#endif\n\n\n    key_array[iteration] = iteration;\n    key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;\n\n\n/*  Determine where the partial verify test keys are, load into  */\n/*  top of array bucket_size                                     */\n    for( i=0; i<TEST_ARRAY_SIZE; i++ )\n        partial_verify_vals[i] = key_array[test_index_array[i]];\n\n#ifdef USE_BUCKETS\n\n/*  Initialize */\n    for( i=0; i<NUM_BUCKETS; i++ )  \n        bucket_size[i] = 0;\n\n/*  Determine the number of keys in each bucket */\n    for( i=0; i<NUM_KEYS; i++ )\n        bucket_size[key_array[i] >> shift]++;\n\n\n/*  Accumulative bucket sizes are the bucket pointers */\n    bucket_ptrs[0] = 0;\n    for( i=1; i< NUM_BUCKETS; i++ )  \n        bucket_ptrs[i] = bucket_ptrs[i-1] + bucket_size[i-1];\n\n\n/*  Sort into appropriate bucket */\n    for( i=0; i<NUM_KEYS; i++ )  \n    {\n        key = key_array[i];\n        key_buff2[bucket_ptrs[key >> shift]++] = key;\n    }\n\n    key_buff_ptr2 = key_buff2;\n\n#else\n\n    key_buff_ptr2 = key_array;\n\n#endif\n\n/*  Clear the work array */\n    for( i=0; i<MAX_KEY; i++ )\n        key_buff1[i] = 0;\n\n\n/*  Ranking of all keys occurs in this section:                 */\n\n    key_buff_ptr = key_buff1;\n\n/*  In this section, the keys themselves are used as their \n    own indexes to determine how many of each there are: their\n    individual population                                       */\n\n    for( i=0; i<NUM_KEYS; i++ )\n        key_buff_ptr[key_buff_ptr2[i]]++;  /* Now they have individual key   */\n                                       /* population                     */\n\n/*  To obtain ranks of each key, successively add the individual key\n    population                                                  */\n\n\n    for( i=0; i<MAX_KEY-1; i++ )   \n        key_buff_ptr[i+1] += key_buff_ptr[i];  \n\n\n/* This is the partial verify test section */\n/* Observe that test_rank_array vals are   */\n/* shifted differently for different cases */\n    for( i=0; i<TEST_ARRAY_SIZE; i++ )\n    {                                             \n        k = partial_verify_vals[i];          /* test vals were put here */\n        if( 0 < k  &&  k <= NUM_KEYS-1 )\n        {\n            INT_TYPE key_rank = key_buff_ptr[k-1];\n            int failed = 0;\n\n            switch( CLASS )\n            {\n                case 'S':\n                    if( i <= 2 )\n                    {\n                        if( key_rank != test_rank_array[i]+iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n                    }\n                    else\n                    {\n                        if( key_rank != test_rank_array[i]-iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n                    }\n                    break;\n                case 'W':\n                    if( i < 2 )\n                    {\n                        if( key_rank != test_rank_array[i]+(iteration-2) )\n                            failed = 1;\n                        else\n                            passed_verification++;\n                    }\n                    else\n                    {\n                        if( key_rank != test_rank_array[i]-iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n                    }\n                    break;\n                case 'A':\n                    if( i <= 2 )\n        \t    {\n                        if( key_rank != test_rank_array[i]+(iteration-1) )\n                            failed = 1;\n                        else\n                            passed_verification++;\n        \t    }\n                    else\n                    {\n                        if( key_rank != test_rank_array[i]-(iteration-1) )\n                            failed = 1;\n                        else\n                            passed_verification++;\n                    }\n                    break;\n                case 'B':\n                    if( i == 1 || i == 2 || i == 4 )\n        \t    {\n                        if( key_rank != test_rank_array[i]+iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n        \t    }\n                    else\n                    {\n                        if( key_rank != test_rank_array[i]-iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n                    }\n                    break;\n                case 'C':\n                    if( i <= 2 )\n        \t    {\n                        if( key_rank != test_rank_array[i]+iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n        \t    }\n                    else\n                    {\n                        if( key_rank != test_rank_array[i]-iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n                    }\n                    break;\n                case 'D':\n                    if( i < 2 )\n        \t    {\n                        if( key_rank != test_rank_array[i]+iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n        \t    }\n                    else\n                    {\n                        if( key_rank != test_rank_array[i]-iteration )\n                            failed = 1;\n                        else\n                            passed_verification++;\n                    }\n                    break;\n            }\n            if( failed == 1 )\n                printf( \"Failed partial verification: \"\n                        \"iteration %d, test key %d\\n\", \n                         iteration, (int)i );\n        }\n    }\n\n\n\n\n/*  Make copies of rank info for use by full_verify: these variables\n    in rank are local; making them global slows down the code, probably\n    since they cannot be made register by compiler                        */\n\n    if( iteration == MAX_ITERATIONS ) \n        key_buff_ptr_global = key_buff_ptr;\n\n}      \n\n\nint main( int argc, char **argv )\n{\n\n    int             i, iteration, timer_on;\n\n    double          timecounter;\n\n    FILE            *fp;\n\n\n/*  Initialize timers  */\n    timer_on = 0;            \n    if ((fp = fopen(\"timer.flag\", \"r\")) != NULL) {\n        fclose(fp);\n        timer_on = 1;\n    }\n    timer_clear( 0 );\n    if (timer_on) {\n        timer_clear( 1 );\n        timer_clear( 2 );\n        timer_clear( 3 );\n    }\n\n    if (timer_on) timer_start( 3 );\n\n\n/*  Initialize the verification arrays if a valid class */\n    for( i=0; i<TEST_ARRAY_SIZE; i++ )\n        switch( CLASS )\n        {\n            case 'S':\n                test_index_array[i] = S_test_index_array[i];\n                test_rank_array[i]  = S_test_rank_array[i];\n                break;\n            case 'A':\n                test_index_array[i] = A_test_index_array[i];\n                test_rank_array[i]  = A_test_rank_array[i];\n                break;\n            case 'W':\n                test_index_array[i] = W_test_index_array[i];\n                test_rank_array[i]  = W_test_rank_array[i];\n                break;\n            case 'B':\n                test_index_array[i] = B_test_index_array[i];\n                test_rank_array[i]  = B_test_rank_array[i];\n                break;\n            case 'C':\n                test_index_array[i] = C_test_index_array[i];\n                test_rank_array[i]  = C_test_rank_array[i];\n                break;\n            case 'D':\n                test_index_array[i] = D_test_index_array[i];\n                test_rank_array[i]  = D_test_rank_array[i];\n                break;\n        };\n\n        \n\n/*  Printout initial NPB info */\n    printf\n      ( \"\\n\\n NAS Parallel Benchmarks (NPB3.3-SER) - IS Benchmark\\n\\n\" );\n    printf( \" Size:  %ld  (class %c)\\n\", (long)TOTAL_KEYS, CLASS );\n    printf( \" Iterations:   %d\\n\", MAX_ITERATIONS );\n\n    if (timer_on) timer_start( 1 );\n\n/*  Generate random number sequence and subsequent keys on all procs */\n    create_seq( 314159265.00,                    /* Random number gen seed */\n                1220703125.00 );                 /* Random number gen mult */\n    if (timer_on) timer_stop( 1 );\n\n\n/*  Do one interation for free (i.e., untimed) to guarantee initialization of  \n    all data and code pages and respective tables */\n    rank( 1 );  \n\n/*  Start verification counter */\n    passed_verification = 0;\n\n    if( CLASS != 'S' ) printf( \"\\n   iteration\\n\" );\n\n/*  Start timer  */             \n    timer_start( 0 );\n\n\n/*  This is the main iteration */\n    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )\n    {\n        if( CLASS != 'S' ) printf( \"        %d\\n\", iteration );\n        rank( iteration );\n    }\n\n\n/*  End of timing, obtain maximum time of all processors */\n    timer_stop( 0 );\n    timecounter = timer_read( 0 );\n\n\n/*  This tests that keys are in sequence: sorting of last ranked key seq\n    occurs here, but is an untimed operation                             */\n    if (timer_on) timer_start( 2 );\n    full_verify();\n    if (timer_on) timer_stop( 2 );\n\n    if (timer_on) timer_stop( 3 );\n\n\n/*  The final printout  */\n    if( passed_verification != 5*MAX_ITERATIONS + 1 )\n        passed_verification = 0;\n    c_print_results( \"IS\",\n                     CLASS,\n                     (int)(TOTAL_KEYS/64),\n                     64,\n                     0,\n                     MAX_ITERATIONS,\n                     timecounter,\n                     ((double) (MAX_ITERATIONS*TOTAL_KEYS))\n                                                  /timecounter/1000000.,\n                     \"keys ranked\", \n                     passed_verification,\n                     NPBVERSION,\n                     COMPILETIME,\n                     CC,\n                     CLINK,\n                     C_LIB,\n                     C_INC,\n                     CFLAGS,\n                     CLINKFLAGS );\n\n\n/*  Print additional timers  */\n    if (timer_on) {\n       double t_total, t_percent;\n\n       t_total = timer_read( 3 );\n       printf(\"\\nAdditional timers -\\n\");\n       printf(\" Total execution: %8.3f\\n\", t_total);\n       if (t_total == 0.0) t_total = 1.0;\n       timecounter = timer_read(1);\n       t_percent = timecounter/t_total * 100.;\n       printf(\" Initialization : %8.3f (%5.2f%%)\\n\", timecounter, t_percent);\n       timecounter = timer_read(0);\n       t_percent = timecounter/t_total * 100.;\n       printf(\" Benchmarking   : %8.3f (%5.2f%%)\\n\", timecounter, t_percent);\n       timecounter = timer_read(2);\n       t_percent = timecounter/t_total * 100.;\n       printf(\" Sorting        : %8.3f (%5.2f%%)\\n\", timecounter, t_percent);\n    }\n\n\n    return 0;\n\n}        \n\n\n\n\n\n", "label": 2}
{"code": "\n#include <iostream>\n#include <fstream>\n#include <cmath>\n#include <cstring>\n\nusing namespace std;\n\n\nbool timeron;\ndouble trecs[t_last];\n\n//---------------------------------------------------------------------\n//     Main function\n//---------------------------------------------------------------------\nint main()\n{\n    char class;\n    bool verified;\n    double mflops;\n    double t, tmax;\n    double timer_read();\n    int i, fstatus;\n    char t_names[t_last][8];\n\n    //---------------------------------------------------------------------\n    //     Read input data\n    //---------------------------------------------------------------------\n    read_input();\n\n    //---------------------------------------------------------------------\n    //     Set up domain sizes\n    //---------------------------------------------------------------------\n    domain();\n\n    //---------------------------------------------------------------------\n    //     Set up coefficients\n    //---------------------------------------------------------------------\n    setcoeff();\n\n    //---------------------------------------------------------------------\n    //     Set the boundary values for dependent variables\n    //---------------------------------------------------------------------\n    setbv();\n\n    //---------------------------------------------------------------------\n    //     Set the initial values for dependent variables\n    //---------------------------------------------------------------------\n    setiv();\n\n    //---------------------------------------------------------------------\n    //     Compute the forcing term based on prescribed exact solution\n    //---------------------------------------------------------------------\n    erhs();\n\n    //---------------------------------------------------------------------\n    //     Perform one SSOR iteration to touch all pages\n    //---------------------------------------------------------------------\n    ssor(1);\n\n    //---------------------------------------------------------------------\n    //     Reset the boundary and initial values\n    //---------------------------------------------------------------------\n    setbv();\n    setiv();\n\n    //---------------------------------------------------------------------\n    //     Perform the SSOR iterations\n    //---------------------------------------------------------------------\n    ssor(itmax);\n\n    //---------------------------------------------------------------------\n    //     Compute the solution error\n    //---------------------------------------------------------------------\n    error();\n\n    //---------------------------------------------------------------------\n    //     Compute the surface integral\n    //---------------------------------------------------------------------\n    pintgr();\n\n    //---------------------------------------------------------------------\n    //     Verification test\n    //---------------------------------------------------------------------\n    verify(rsdnm, errnm, frc, class, verified);\n    mflops = float(itmax) * (1984.77 * float(nx0) * float(ny0) * float(nz0) - 10923.3 * (float(nx0 + ny0 + nz0) / 3.) * (float(nx0 + ny0 + nz0) / 3.) + 27770.9 * float(nx0 + ny0 + nz0) / 3. - 144010.) / (maxtime * 1000000.);\n    print_results(\"LU\", class, nx0, ny0, nz0, itmax, maxtime, mflops, \"          floating point\", verified, npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, \"(none)\");\n\n    //---------------------------------------------------------------------\n    //     More timers\n    //---------------------------------------------------------------------\n    if (!timeron)\n        goto L999;\n    for (i = 0; i < t_last; i++)\n    {\n        trecs[i] = timer_read(i);\n    }\n    tmax = maxtime;\n    if (tmax == 0.)\n        tmax = 1.0;\n    cout << \"  SECTION     Time (secs)\" << endl;\n    for (i = 0; i < t_last; i++)\n    {\n        cout << \"  \" << t_names[i] << \": \" << trecs[i] << \"  (\" << trecs[i] * 100. / tmax << \"%)\" << endl;\n        if (i == t_rhs)\n        {\n            t = trecs[t_rhsx] + trecs[t_rhsy] + trecs[t_rhsz];\n            cout << \"     --> total rhs: \" << t << \"  (\" << t * 100. / tmax << \"%)\" << endl;\n            t = trecs[i] - t;\n            cout << \"     --> rest rhs: \" << t << \"  (\" << t * 100. / tmax << \"%)\" << endl;\n        }\n    }\nL999:\n    return 0;\n}\n\n\n", "label": 1}
{"code": "#include <iostream>\n#include <fstream>\n#include <string>\n#include <cmath>\n#include <vector>\n#include <cstdlib>\n#include <omp.h>\n\n#define MAX_ARGS 10\n#define REC_LENGTH 49   // size of a record in db\n#define REC_WINDOW 10   // number of records to read at a time\n#define LATITUDE_POS 28 // location of latitude coordinates in input record\n#define OPEN 10000      // initial value of nearest neighbors\n\nstruct neighbor {\n    std::string entry;\n    double dist;\n};\n\nint main(int argc, char *argv[]) {\n    std::ifstream flist, fp;\n    int i = 0, j = 0, k = 0, rec_count = 0, done = 0;\n    char sandbox[REC_LENGTH * REC_WINDOW];\n    std::string dbname;\n    std::vector<neighbor> neighbors;\n    float target_lat, target_long, tmp_lat = 0, tmp_long = 0;\n\n    if (argc < 5) {\n        std::cerr << \"Invalid set of arguments\\n\";\n        exit(-1);\n    }\n\n    flist.open(argv[1]);\n    if (!flist.is_open()) {\n        std::cout << \"error opening flist\\n\";\n        exit(1);\n    }\n\n    k = std::atoi(argv[2]);\n    target_lat = std::atof(argv[3]);\n    target_long = std::atof(argv[4]);\n    neighbors.resize(k);\n\n    for (j = 0; j < k; j++) {\n        neighbors[j].dist = OPEN;\n    }\n\n    if (!(flist >> dbname)) {\n        std::cerr << \"error reading filelist\\n\";\n        exit(0);\n    }\n\n    fp.open(dbname);\n    if (!fp.is_open()) {\n        std::cout << \"error opening flist\\n\";\n        exit(1);\n    }\n\n    std::vector<float> z(REC_WINDOW);\n\n    while (!done) {\n        fp.read(sandbox, REC_LENGTH * REC_WINDOW);\n        rec_count = fp.gcount() / REC_LENGTH;\n        if (rec_count != REC_WINDOW) {\n            if (!fp.bad()) {\n                fp.close();\n                if (flist.eof())\n                    done = 1;\n                else {\n                    if (!(flist >> dbname)) {\n                        std::cerr << \"error reading filelist\\n\";\n                        exit(0);\n                    }\n                    fp.open(dbname);\n                    if (!fp.is_open()) {\n                        std::cout << \"error opening a db\\n\";\n                        exit(1);\n                    }\n                }\n            } else {\n                perror(\"Error\");\n                exit(0);\n            }\n        }\n\n#pragma omp parallel for shared(z, target_lat, target_long) private(i, tmp_lat, tmp_long)\n        for (i = 0; i < rec_count; i++) {\n            std::sscanf(sandbox + (i * REC_LENGTH + LATITUDE_POS - 1), \"%f %f\", &tmp_lat, &tmp_long);\n            z[i] = std::sqrt(((tmp_lat - target_lat) * (tmp_lat - target_lat)) +\n                             ((tmp_long - target_long) * (tmp_long - target_long)));\n        }\n\n#pragma omp barrier\n        for (i = 0; i < rec_count; i++) {\n            float max_dist = -1;\n            int max_idx = 0;\n            for (j = 0; j < k; j++) {\n                if (neighbors[j].dist > max_dist) {\n                    max_dist = neighbors[j].dist;\n                    max_idx = j;\n                }\n            }\n            if (z[i] < neighbors[max_idx].dist) {\n                sandbox[(i + 1) * REC_LENGTH - 1] = '\\0';\n                neighbors[max_idx].entry = std::string(sandbox + i * REC_LENGTH);\n                neighbors[max_idx].dist = z[i];\n            }\n        }\n    }\n\n    if (getenv(\"OUTPUT\")) {\n        std::ofstream out(\"output.txt\");\n        out << \"The \" << k << \" nearest neighbors are:\\n\";\n        for (j = k - 1; j >= 0; j--) {\n            if (!(neighbors[j].dist == OPEN))\n                out << neighbors[j].entry << \" --> \" << neighbors[j].dist << \"\\n\";\n        }\n    }\n\n    flist.close();\n    return 0;\n}\n\n\n", "label": 1}
{"code": "#include <stdio.h>\n#include <stdlib.h>\n#include <math.h>\n#include <string.h>\n#include <omp.h>\n#include \"define.c\"\n#include \"ecc.c\"\n#include \"cam.c\"\n#include \"fin.c\"\n#include \"master.c\"\n#include \"embedded_fehlberg_7_8.c\"\n#include \"solver.c\"\n#include \"file.c\"\n#include \"timer.c\"\n//====================================================================================================100\n//\tMAIN FUNCTION\n//====================================================================================================100\nint main(int argc, char *argv[]) {\n    //================================================================================80\n    //\t\tVARIABLES\n    //================================================================================80\n    //============================================================60\n    //\t\tTIME\n    //============================================================60\n    long long time0;\n    long long time1;\n    long long time2;\n    long long time3;\n    long long time4;\n    long long time5;\n    time0 = get_time();\n    //============================================================60\n    //\t\tCOUNTERS\n    //============================================================60\n    long long memory;\n    int i, j;\n    int status;\n    int mode;\n    //============================================================60\n    //\t\tSOLVER PARAMETERS\n    //============================================================60\n    long workload;\n    long xmin;\n    long xmax;\n    fp h;\n    fp tolerance;\n    //============================================================60\n    //\t\tDATA\n    //============================================================60\n    fp ***y;\n    fp **x;\n    fp **params;\n    //============================================================60\n    //\t\tOPENMP\n    //============================================================60\n    int threads;\n    //================================================================================80\n    // \tGET INPUT PARAMETERS\n    //================================================================================80\n    //============================================================60\n    //\t\tCHECK NUMBER OF ARGUMENTS\n    //============================================================60\n    if (argc != 5) {\n        printf(\"ERROR: %d is the incorrect number of arguments, the number of \"\n               \"arguments must be 4\\n\",\n               argc - 1);\n        return 0;\n    }\n    //============================================================60\n    //\t\tGET AND CHECK PARTICULAR ARGUMENTS\n    //============================================================60\n    else {\n        //========================================40\n        //\t\tSPAN\n        //========================================40\n        xmax = atoi(argv[1]);\n        if (xmax < 0) {\n            printf(\"ERROR: %d is the incorrect end of simulation interval, use \"\n                   \"numbers > 0\\n\",\n                   xmax);\n            return 0;\n        }\n        //========================================40\n        //\t\tWORKLOAD\n        //========================================40\n        workload = atoi(argv[2]);\n        if (workload < 0) {\n            printf(\"ERROR: %d is the incorrect number of instances of \"\n                   \"simulation, use numbers > 0\\n\",\n                   workload);\n            return 0;\n        }\n        //========================================40\n        //\t\tMODE\n        //========================================40\n        mode = 0;\n        mode = atoi(argv[3]);\n        if (mode != 0 && mode != 1) {\n            printf(\"ERROR: %d is the incorrect mode, it should be omitted or \"\n                   \"equal to 0 or 1\\n\",\n                   mode);\n            return 0;\n        }\n        //========================================40\n        //\t\tTHREADS\n        //========================================40\n        threads = atoi(argv[4]);\n        if (threads < 0) {\n            printf(\"ERROR: %d is the incorrect number of threads, use numbers \"\n                   \"> 0\\n\",\n                   threads);\n            return 0;\n        }\n        omp_set_num_threads(threads);\n    }\n    time1 = get_time();\n    //================================================================================80\n    // \tALLOCATE MEMORY\n    //================================================================================80\n    //============================================================60\n    //\t\tMEMORY CHECK\n    //============================================================60\n    memory = workload * (xmax + 1) * EQUATIONS * 4;\n    if (memory > 1000000000) {\n        printf(\"ERROR: trying to allocate more than 1.0GB of memory, decrease \"\n               \"workload and span parameters or change memory parameter\\n\");\n        return 0;\n    }\n    //============================================================60\n    // \tALLOCATE ARRAYS\n    //============================================================60\n    y = (fp ***)malloc(workload * sizeof(fp **));\n    for (i = 0; i < workload; i++) {\n        y[i] = (fp **)malloc((1 + xmax) * sizeof(fp *));\n        for (j = 0; j < (1 + xmax); j++) {\n            y[i][j] = (fp *)malloc(EQUATIONS * sizeof(fp));\n        }\n    }\n    x = (fp **)malloc(workload * sizeof(fp *));\n    for (i = 0; i < workload; i++) {\n        x[i] = (fp *)malloc((1 + xmax) * sizeof(fp));\n    }\n    params = (fp **)malloc(workload * sizeof(fp *));\n    for (i = 0; i < workload; i++) {\n        params[i] = (fp *)malloc(PARAMETERS * sizeof(fp));\n    }\n    time2 = get_time();\n    //================================================================================80\n    // \tINITIAL VALUES\n    //================================================================================80\n    // y\n    for (i = 0; i < workload; i++) {\n        read(\"../../data/myocyte/y.txt\", y[i][0], 91, 1, 0);\n    }\n    // params\n    for (i = 0; i < workload; i++) {\n        read(\"../../data/myocyte/params.txt\", params[i], 16, 1, 0);\n    }\n    time3 = get_time();\n    //================================================================================80\n    //\tEXECUTION\n    //================================================================================80\n    if (mode == 0) {\n        for (i = 0; i < workload; i++) {\n            status = solver(y[i], x[i], xmax, params[i], mode);\n            // if(status !=0){\n            // printf(\"STATUS: %d\\n\", status);\n            // }\n        }\n    } else {\n#pragma omp parallel for private(i, status) shared(y, x, xmax, params, mode)\n        for (i = 0; i < workload; i++) {\n            status = solver(y[i], x[i], xmax, params[i], mode);\n            // if(status !=0){\n            // printf(\"STATUS: %d\\n\", status);\n            // }\n        }\n    }\n    // // print results\n    // int k;\n    // for(i=0; i<workload; i++){\n    // printf(\"WORKLOAD %d:\\n\", i);\n    // for(j=0; j<(xmax+1); j++){\n    // printf(\"\\tTIME %d:\\n\", j);\n    // for(k=0; k<EQUATIONS; k++){\n    // printf(\"\\t\\ty[%d][%d][%d]=%13.10f\\n\", i, j, k, y[i][j][k]);\n    // }\n    // }\n    // }\n    time4 = get_time();\n    //================================================================================80\n    //\tDEALLOCATION\n    //================================================================================80\n    // y values\n    for (i = 0; i < workload; i++) {\n        for (j = 0; j < (1 + xmax); j++) {\n            free(y[i][j]);\n        }\n        free(y[i]);\n    }\n    free(y);\n    // x values\n    for (i = 0; i < workload; i++) {\n        free(x[i]);\n    }\n    free(x);\n    // parameters\n    for (i = 0; i < workload; i++) {\n        free(params[i]);\n    }\n    free(params);\n    time5 = get_time();\n    //================================================================================80\n    //\t\tDISPLAY TIMING\n    //================================================================================80\n    printf(\"Time spent in different stages of the application:\\n\");\n    printf(\"%.12f s, %.12f % : SETUP VARIABLES, READ COMMAND LINE ARGUMENTS\\n\",\n           (float)(time1 - time0) / 1000000,\n           (float)(time1 - time0) / (float)(time5 - time0) * 100);\n    printf(\"%.12f s, %.12f % : ALLOCATE MEMORY\\n\",\n           (float)(time2 - time1) / 1000000,\n           (float)(time2 - time1) / (float)(time5 - time0) * 100);\n    printf(\"%.12f s, %.12f % : READ DATA FROM FILES\\n\",\n           (float)(time3 - time2) / 1000000,\n           (float)(time3 - time2) / (float)(time5 - time0) * 100);\n    printf(\"%.12f s, %.12f % : RUN COMPUTATION\\n\",\n           (float)(time4 - time3) / 1000000,\n           (float)(time4 - time3) / (float)(time5 - time0) * 100);\n    printf(\"%.12f s, %.12f % : FREE MEMORY\\n\", (float)(time5 - time4) / 1000000,\n           (float)(time5 - time4) / (float)(time5 - time0) * 100);\n    printf(\"Total time:\\n\");\n    printf(\"%.12f s\\n\", (float)(time5 - time0) / 1000000);\n    //====================================================================================================100\n    //\tEND OF FILE\n    //====================================================================================================100\n}\n\n\n", "label": 2}
{"code": "/*\n ******************************************************************\n * HISTORY\n * 15-Oct-94  Jeff Shufelt (js), Carnegie Mellon University\n *\tPrepared for 15-681, Fall 1994.\n * Modified by Shuai Che\n ******************************************************************\n */\n#include <omp.h>\n#include <stdio.h>\n#include <unistd.h>\n#include <fcntl.h>\n#include <stdlib.h>\n#include \"backprop.h\"\n#include <math.h>\n#define ABS(x) (((x) > 0.0) ? (x) : (-(x)))\n#define fastcopy(to, from, len)                                                \\\n    {                                                                          \\\n        register char *_to, *_from;                                            \\\n        register int _i, _l;                                                   \\\n        _to = (char *)(to);                                                    \\\n        _from = (char *)(from);                                                \\\n        _l = (len);                                                            \\\n        for (_i = 0; _i < _l; _i++)                                            \\\n            *_to++ = *_from++;                                                 \\\n    }\n/*** Return random number between 0.0 and 1.0 ***/\nfloat drnd() { return ((float)rand() / (float)BIGRND); }\n/*** Return random number between -1.0 and 1.0 ***/\nfloat dpn1() { return ((drnd() * 2.0) - 1.0); }\n/*** The squashing function.  Currently, it's a sigmoid. ***/\nfloat squash(float x) {\n    float m;\n    // x = -x;\n    // m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;\n    // return(1.0 / (1.0 + m));\n    return (1.0 / (1.0 + exp(-x)));\n}\n/*** Allocate 1d array of floats ***/\nfloat *alloc_1d_dbl(int n) {\n    float *new;\n    new = (float *)malloc((unsigned)(n * sizeof(float)));\n    if (new == NULL) {\n        printf(\"ALLOC_1D_DBL: Couldn't allocate array of floats\\n\");\n        return (NULL);\n    }\n    return (new);\n}\n/*** Allocate 2d array of floats ***/\nfloat **alloc_2d_dbl(int m, int n) {\n    int i;\n    float **new;\n    new = (float **)malloc((unsigned)(m * sizeof(float *)));\n    if (new == NULL) {\n        printf(\"ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\\n\");\n        return (NULL);\n    }\n    for (i = 0; i < m; i++) {\n        new[i] = alloc_1d_dbl(n);\n    }\n    return (new);\n}\nvoid bpnn_randomize_weights(float **w, int m, int n) {\n    int i, j;\n    for (i = 0; i <= m; i++) {\n        for (j = 0; j <= n; j++) {\n            w[i][j] = (float)rand() / RAND_MAX;\n            //  w[i][j] = dpn1();\n        }\n    }\n}\nvoid bpnn_randomize_row(float *w, int m) {\n    int i;\n    for (i = 0; i <= m; i++) {\n        // w[i] = (float) rand()/RAND_MAX;\n        w[i] = 0.1;\n    }\n}\nvoid bpnn_zero_weights(float **w, int m, int n) {\n    int i, j;\n    for (i = 0; i <= m; i++) {\n        for (j = 0; j <= n; j++) {\n            w[i][j] = 0.0;\n        }\n    }\n}\nvoid bpnn_initialize(int seed) {\n    printf(\"Random number generator seed: %d\\n\", seed);\n    srand(seed);\n}\nBPNN *bpnn_internal_create(int n_in, int n_hidden, int n_out) {\n    BPNN *newnet;\n    newnet = (BPNN *)malloc(sizeof(BPNN));\n    if (newnet == NULL) {\n        printf(\"BPNN_CREATE: Couldn't allocate neural network\\n\");\n        return (NULL);\n    }\n    newnet->input_n = n_in;\n    newnet->hidden_n = n_hidden;\n    newnet->output_n = n_out;\n    newnet->input_units = alloc_1d_dbl(n_in + 1);\n    newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);\n    newnet->output_units = alloc_1d_dbl(n_out + 1);\n    newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);\n    newnet->output_delta = alloc_1d_dbl(n_out + 1);\n    newnet->target = alloc_1d_dbl(n_out + 1);\n    newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);\n    newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);\n    newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);\n    newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);\n    return (newnet);\n}\nvoid bpnn_free(BPNN *net) {\n    int n1, n2, i;\n    n1 = net->input_n;\n    n2 = net->hidden_n;\n    free((char *)net->input_units);\n    free((char *)net->hidden_units);\n    free((char *)net->output_units);\n    free((char *)net->hidden_delta);\n    free((char *)net->output_delta);\n    free((char *)net->target);\n    for (i = 0; i <= n1; i++) {\n        free((char *)net->input_weights[i]);\n        free((char *)net->input_prev_weights[i]);\n    }\n    free((char *)net->input_weights);\n    free((char *)net->input_prev_weights);\n    for (i = 0; i <= n2; i++) {\n        free((char *)net->hidden_weights[i]);\n        free((char *)net->hidden_prev_weights[i]);\n    }\n    free((char *)net->hidden_weights);\n    free((char *)net->hidden_prev_weights);\n    free((char *)net);\n}\n/*** Creates a new fully-connected network from scratch,\n     with the given numbers of input, hidden, and output units.\n     Threshold units are automatically included.  All weights are\n     randomly initialized.\n     Space is also allocated for temporary storage (momentum weights,\n     error computations, etc).\n***/\nBPNN *bpnn_create(int n_in, int n_hidden, int n_out) {\n    BPNN *newnet;\n    newnet = bpnn_internal_create(n_in, n_hidden, n_out);\n#ifdef INITZERO\n    bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);\n#else\n    bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);\n#endif\n    bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);\n    bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);\n    bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);\n    bpnn_randomize_row(newnet->target, n_out);\n    return (newnet);\n}\nvoid bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2) {\n    float sum;\n    int j, k;\n    /*** Set up thresholding unit ***/\n    l1[0] = 1.0;\n    omp_set_num_threads(NUM_THREAD);\n#pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(     \\\n    + : sum) schedule(static)\n    /*** For each unit in second layer ***/\n    for (j = 1; j <= n2; j++) {\n        /*** Compute weighted sum of its inputs ***/\n        sum = 0.0;\n        for (k = 0; k <= n1; k++) {\n            sum += conn[k][j] * l1[k];\n        }\n        l2[j] = squash(sum);\n    }\n}\n// extern \"C\"\nvoid bpnn_output_error(float *delta, float *target, float *output, int nj,\n                       float *err) {\n    int j;\n    float o, t, errsum;\n    errsum = 0.0;\n    for (j = 1; j <= nj; j++) {\n        o = output[j];\n        t = target[j];\n        delta[j] = o * (1.0 - o) * (t - o);\n        errsum += ABS(delta[j]);\n    }\n    *err = errsum;\n}\nvoid bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no,\n                       float **who, float *hidden, float *err) {\n    int j, k;\n    float h, sum, errsum;\n    errsum = 0.0;\n    for (j = 1; j <= nh; j++) {\n        h = hidden[j];\n        sum = 0.0;\n        for (k = 1; k <= no; k++) {\n            sum += delta_o[k] * who[j][k];\n        }\n        delta_h[j] = h * (1.0 - h) * sum;\n        errsum += ABS(delta_h[j]);\n    }\n    *err = errsum;\n}\nvoid bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w,\n                         float **oldw) {\n    float new_dw;\n    int k, j;\n    ly[0] = 1.0;\n    // eta = 0.3;\n    // momentum = 0.3;\n    omp_set_num_threads(NUM_THREAD);\n#pragma omp parallel for shared(oldw, w, delta) private(                       \\\n    j, k, new_dw) firstprivate(ndelta, nly)\n    for (j = 1; j <= ndelta; j++) {\n        for (k = 0; k <= nly; k++) {\n            new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));\n            w[k][j] += new_dw;\n            oldw[k][j] = new_dw;\n        }\n    }\n}\nvoid bpnn_feedforward(BPNN *net) {\n    int in, hid, out;\n    in = net->input_n;\n    hid = net->hidden_n;\n    out = net->output_n;\n    /*** Feed forward input activations. ***/\n    bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,\n                      in, hid);\n    bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,\n                      hid, out);\n}\nvoid bpnn_train(BPNN *net, float *eo, float *eh) {\n    int in, hid, out;\n    float out_err, hid_err;\n    in = net->input_n;\n    hid = net->hidden_n;\n    out = net->output_n;\n    /*** Feed forward input activations. ***/\n    bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights,\n                      in, hid);\n    bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,\n                      hid, out);\n    /*** Compute error on output and hidden units. ***/\n    bpnn_output_error(net->output_delta, net->target, net->output_units, out,\n                      &out_err);\n    bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,\n                      net->hidden_weights, net->hidden_units, &hid_err);\n    *eo = out_err;\n    *eh = hid_err;\n    /*** Adjust input and hidden weights. ***/\n    bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,\n                        net->hidden_weights, net->hidden_prev_weights);\n    bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,\n                        net->input_weights, net->input_prev_weights);\n}\nvoid bpnn_save(BPNN *net, char *filename) {\n    int n1, n2, n3, i, j, memcnt;\n    float dvalue, **w;\n    char *mem;\n    FILE *pFile;\n    pFile = fopen(filename, \"w+\");\n    n1 = net->input_n;\n    n2 = net->hidden_n;\n    n3 = net->output_n;\n    printf(\"Saving %dx%dx%d network to '%s'\\n\", n1, n2, n3, filename);\n    fwrite((int *)&n1, sizeof(int), 1, pFile);\n    fwrite((int *)&n2, sizeof(int), 1, pFile);\n    fwrite((int *)&n3, sizeof(int), 1, pFile);\n    memcnt = 0;\n    w = net->input_weights;\n    mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));\n    for (i = 0; i <= n1; i++) {\n        for (j = 0; j <= n2; j++) {\n            dvalue = w[i][j];\n            fastcopy(&mem[memcnt], &dvalue, sizeof(float));\n            memcnt += sizeof(float);\n        }\n    }\n    fwrite(mem, (unsigned)(sizeof(float)), (unsigned)((n1 + 1) * (n2 + 1)),\n           pFile);\n    free(mem);\n    memcnt = 0;\n    w = net->hidden_weights;\n    mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));\n    for (i = 0; i <= n2; i++) {\n        for (j = 0; j <= n3; j++) {\n            dvalue = w[i][j];\n            fastcopy(&mem[memcnt], &dvalue, sizeof(float));\n            memcnt += sizeof(float);\n        }\n    }\n    fwrite(mem, sizeof(float), (unsigned)((n2 + 1) * (n3 + 1)), pFile);\n    free(mem);\n    fclose(pFile);\n    return;\n}\nBPNN *bpnn_read(char *filename) {\n    char *mem;\n    BPNN *new;\n    int fd, n1, n2, n3, i, j, memcnt;\n    if ((fd = open(filename, 0, 0644)) == -1) {\n        return (NULL);\n    }\n    printf(\"Reading '%s'\\n\", filename);\n    read(fd, (int *)&n1, sizeof(int));\n    read(fd, (int *)&n2, sizeof(int));\n    read(fd, (int *)&n3, sizeof(int));\n    new = bpnn_internal_create(n1, n2, n3);\n    printf(\"'%s' contains a %dx%dx%d network\\n\", filename, n1, n2, n3);\n    printf(\"Reading input weights...\");\n    memcnt = 0;\n    mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));\n    read(fd, mem, (n1 + 1) * (n2 + 1) * sizeof(float));\n    for (i = 0; i <= n1; i++) {\n        for (j = 0; j <= n2; j++) {\n            fastcopy(&(new->input_weights[i][j]), &mem[memcnt], sizeof(float));\n            memcnt += sizeof(float);\n        }\n    }\n    free(mem);\n    printf(\"Done\\nReading hidden weights...\");\n    memcnt = 0;\n    mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));\n    read(fd, mem, (n2 + 1) * (n3 + 1) * sizeof(float));\n    for (i = 0; i <= n2; i++) {\n        for (j = 0; j <= n3; j++) {\n            fastcopy(&(new->hidden_weights[i][j]), &mem[memcnt], sizeof(float));\n            memcnt += sizeof(float);\n        }\n    }\n    free(mem);\n    close(fd);\n    printf(\"Done\\n\");\n    bpnn_zero_weights(new->input_prev_weights, n1, n2);\n    bpnn_zero_weights(new->hidden_prev_weights, n2, n3);\n    return (new);\n}\n\n\n", "label": 2}
{"code": "#include <cmath>\n#include <iostream>\n#include <fstream>\n#include <string>\n\n#define T_last 3\n#define naa na\n#define nzz nz\n\nvoid timer_clear(int i) {}\nvoid timer_start(int i) {}\nvoid timer_stop(int i) {}\ndouble timer_read(int i) { return 0.0; }\n\nvoid print_results(std::string bmname, char clss, int na, int nonzer, int niter, double t, double mflops, std::string optype, bool verified, std::string npbversion, std::string compiletime, std::string cs1, std::string cs2, std::string cs3, std::string cs4, std::string cs5, std::string cs6, std::string cs7) {\n    std::cout << \" Benchmark completed \" << std::endl;\n}\n\nvoid conj_grad(int colidx[], int rowstr[], double x[], double z[], double a[], double p[], double q[], double r[], double& rnorm) {\n    int cgit, cgitmax;\n    double d, sum, rho, rho0, alpha, beta;\n    cgitmax = 25;\n    rho = 0.0;\n    for (int j = 1; j <= naa + 1; j++) {\n        q[j] = 0.0;\n        z[j] = 0.0;\n        r[j] = x[j];\n        p[j] = r[j];\n    }\n    for (cgit = 1; cgit <= cgitmax; cgit++) {\n        sum = 0.0;\n        for (int j = 1; j <= lastrow - firstrow + 1; j++) {\n            sum = sum + r[j] * r[j];\n        }\n        rho = sum;\n        for (int j = 1; j <= lastrow - firstrow + 1; j++) {\n            sum = 0.0;\n            for (int k = rowstr[j]; k <= rowstr[j + 1] - 1; k++) {\n                sum = sum + a[k] * p[colidx[k]];\n            }\n            q[j] = sum;\n        }\n        d = 0.0;\n        for (int j = 1; j <= lastcol - firstcol + 1; j++) {\n            d = d + p[j] * q[j];\n        }\n        alpha = rho / d;\n        rho0 = rho;\n        rho = 0.0;\n        for (int j = 1; j <= lastcol - firstcol + 1; j++) {\n            z[j] = z[j] + alpha * p[j];\n            r[j] = r[j] - alpha * q[j];\n            rho = rho + r[j] * r[j];\n        }\n        beta = rho / rho0;\n        for (int j = 1; j <= lastcol - firstcol + 1; j++) {\n            p[j] = r[j] + beta * p[j];\n        }\n    }\n    sum = 0.0;\n    for (int j = 1; j <= lastrow - firstrow + 1; j++) {\n        double d = 0.0;\n        for (int k = rowstr[j]; k <= rowstr[j + 1] - 1; k++) {\n            d = d + a[k] * z[colidx[k]];\n        }\n        r[j] = d;\n    }\n    for (int j = 1; j <= lastcol - firstcol + 1; j++) {\n        double d = x[j] - r[j];\n        sum = sum + d * d;\n    }\n    rnorm = sqrt(sum);\n}\n\nvoid makea(int n, int nz, double a[], int colidx[], int rowstr[], int firstrow, int lastrow, int firstcol, int lastcol, int arow[], int acol[][n], double aelt[][n], int iv[]) {\n    double rcond, shift;\n    for (int j = 1; j <= nz; j++) {\n        a[j] = 0.0;\n    }\n    for (int j = 1; j <= n; j++) {\n        iv[j] = 0;\n        rowstr[j] = 0;\n    }\n    for (int i = 1; i <= n + 1; i++) {\n        colidx[i] = 0;\n    }\n    int iouter = 0;\n    for (int i = 1; i <= n; i++) {\n        int nnza = nonzer;\n        sprnvc(n, nnza, v, iv, &(colidx[1]), &(a[1]), &(arow[1]), &(acol[1][1]), &(aelt[1][1]), firstrow, lastrow, firstcol, lastcol, rcond, shift);\n        vecset(n, v, iv, &(colidx[1]), &(a[1]), &(arow[1]), &(acol[1][1]), &(aelt[1][1]), firstrow, lastrow, firstcol, lastcol, i, 0.5);\n        for (int j = rowstr[i]; j <= rowstr[i + 1] - 1; j++) {\n            a[j] = a[j] * rcond;\n        }\n    }\n    sparse(a, colidx, rowstr, n, nz, &(arow[1]), &(acol[1][1]), &(aelt[1][1]), firstrow, lastrow, v, &(iv[1]), &(a[1]), rcond, shift);\n}\n\nint main() {\n    int colidx[nz], rowstr[na + 1], iv[na], arow[na], acol[nz], aelt[nz];\n    double a[nz], x[na + 2], z[na + 2], p[na + 2], q[na + 2], r[na + 2];\n    int i, j, k, it;\n    double zeta, randlc;\n    double rnorm;\n    double norm_temp1, norm_temp2;\n    double t, mflops, tmax;\n    char class;\n    bool verified;\n    double zeta_verify_value, epsilon, err;\n    int fstatus;\n    std::string t_names[T_last];\n    for (i = 1; i <= T_last; i++) {\n        timer_clear(i);\n    }\n    std::ifstream timer_file(\"timer.flag\");\n    if (timer_file.good()) {\n        timeron = true;\n        t_names[t_init] = \"init\";\n        t_names[t_bench] = \"benchmk\";\n        t_names[t_conj_grad] = \"conjgd\";\n        timer_file.close();\n    } else {\n        timeron = false;\n    }\n    timer_start(T_init);\n    firstrow = 1;\n    lastrow = na;\n    firstcol = 1;\n    lastcol = na;\n    if (na == 1400 && nonzer == 7 && niter == 15 && shift == 10.0) {\n        class = 'S';\n        zeta_verify_value = 8.5971775078648;\n    } else if (na == 7000 && nonzer == 8 && niter == 15 && shift == 12.0) {\n        class = 'W';\n        zeta_verify_value = 10.362595087124;\n    } else if (na == 14000 && nonzer == 11 && niter == 15 && shift == 20.0) {\n        class = 'A';\n        zeta_verify_value = 17.130235054029;\n    } else if (na == 75000 && nonzer == 13 && niter == 75 && shift == 60.0) {\n        class = 'B';\n        zeta_verify_value = 22.712745482631;\n    } else if (na == 150000 && nonzer == 15 && niter == 75 && shift == 110.0) {\n        class = 'C';\n        zeta_verify_value = 28.973605592845;\n    } else if (na == 1500000 && nonzer == 21 && niter == 100 && shift == 500.0) {\n        class = 'D';\n        zeta_verify_value = 52.514532105794;\n    } else if (na == 9000000 && nonzer == 26 && niter == 100 && shift == 1500.0) {\n        class = 'E';\n        zeta_verify_value = 77.522164599383;\n    } else {\n        class = 'U';\n    }\n    std::cout << std::endl;\n    std::cout << \" NAS Parallel Benchmarks (NPB3.3-SER) - CG Benchmark\" << std::endl;\n    std::cout << \" Size: \" << na << std::endl;\n    std::cout << \" Iterations: \" << niter << std::endl;\n    std::cout << std::endl;\n    naa = na;\n    nzz = nz;\n    tran = 314159265.0;\n    amult = 1220703125.0;\n    zeta = randlc(tran, amult);\n    makea(naa, nzz, a, colidx, rowstr, firstrow, lastrow, firstcol, lastcol, arow, acol, aelt, iv);\n    for (j = 1; j <= lastrow - firstrow + 1; j++) {\n        for (k = rowstr[j]; k <= rowstr[j + 1] - 1; k++) {\n            colidx[k] = colidx[k] - firstcol + 1;\n        }\n    }\n    for (i = 1; i <= na + 1; i++) {\n        x[i] = 1.0;\n    }\n    for (j = 1; j <= lastcol - firstcol + 1; j++) {\n        q[j] = 0.0;\n        z[j] = 0.0;\n        r[j] = 0.0;\n        p[j] = 0.0;\n    }\n    zeta = 0.0;\n    for (it = 1; it <= 1; it++) {\n        conj_grad(colidx, rowstr, x, z, a, p, q, r, rnorm);\n        norm_temp1 = 0.0;\n        norm_temp2 = 0.0;\n        for (j = 1; j <= lastcol - firstcol + 1; j++) {\n            norm_temp1 = norm_temp1 + x[j] * z[j];\n            norm_temp2 = norm_temp2 + z[j] * z[j];\n        }\n        norm_temp2 = 1.0 / sqrt(norm_temp2);\n        for (j = 1; j <= lastcol - firstcol + 1; j++) {\n            x[j] = norm_temp2 * z[j];\n        }\n    }\n    for (i = 1; i <= na + 1; i++) {\n        x[i] = 1.0;\n    }\n    zeta = 0.0;\n    timer_stop(T_init);\n    std::cout << \" Initialization time = \" << timer_read(T_init) << \" seconds\" << std::endl;\n    timer_start(T_bench);\n    for (it = 1; it <= niter; it++) {\n        if (timeron) {\n            timer_start(T_conj_grad);\n        }\n        conj_grad(colidx, rowstr, x, z, a, p, q, r, rnorm);\n        if (timeron) {\n            timer_stop(T_conj_grad);\n        }\n        norm_temp1 = 0.0;\n        norm_temp2 = 0.0;\n        for (j = 1; j <= lastcol - firstcol + 1; j++) {\n            norm_temp1 = norm_temp1 + x[j] * z[j];\n            norm_temp2 = norm_temp2 + z[j] * z[j];\n        }\n        norm_temp2 = 1.0 / sqrt(norm_temp2);\n        zeta = shift + 1.0 / norm_temp1;\n        if (it == 1) {\n            std::cout << std::endl;\n            std::cout << \"   iteration           ||r||                 zeta\" << std::endl;\n        }\n        std::cout << \"    \" << std::setw(5) << it << \"       \" << std::scientific << std::setw(20) << rnorm << \"  \" << std::fixed << std::setw(20) << zeta << std::endl;\n        for (j = 1; j <= lastcol - firstcol + 1; j++) {\n            x[j] = norm_temp2 * z[j];\n        }\n    }\n    timer_stop(T_bench);\n    t = timer_read(T_bench);\n    std::cout << \" Benchmark completed \" << std::endl;\n    epsilon = 1.0e-10;\n    if (class != 'U') {\n        err = std::abs(zeta - zeta_verify_value) / zeta_verify_value;\n        if (err <= epsilon) {\n            verified = true;\n            std::cout << \" VERIFICATION SUCCESSFUL \" << std::endl;\n            std::cout << \" Zeta is    \" << std::scientific << std::setw(20) << zeta << std::endl;\n            std::cout << \" Error is   \" << std::scientific << std::setw(20) << err << std::endl;\n        } else {\n            verified = false;\n            std::cout << \" VERIFICATION FAILED\" << std::endl;\n            std::cout << \" Zeta                \" << std::scientific << std::setw(20) << zeta << std::endl;\n            std::cout << \" The correct zeta is \" << std::scientific << std::setw(20) << zeta_verify_value << std::endl;\n        }\n    } else {\n        verified = false;\n        std::cout << \" Problem size unknown\" << std::endl;\n        std::cout << \" NO VERIFICATION PERFORMED\" << std::endl;\n    }\n    if (t != 0.0) {\n        mflops = (2 * niter * na) * (3.0 + (nonzer * (nonzer + 1)) + 25.0 * (5.0 + (nonzer * (nonzer + 1))) + 3.0) / t / 1000000.0;\n    } else {\n        mflops = 0.0;\n    }\n    print_results(\"CG\", class, na, 0, 0, niter, t, mflops, \"          floating point\", verified, npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7);\n    if (!timeron) {\n        goto L999;\n    }\n    tmax = timer_read(T_bench);\n    if (tmax == 0.0) {\n        tmax = 1.0;\n    }\n    std::cout << \"  SECTION   Time (secs)\" << std::endl;\n    for (i = 1; i <= T_last; i++) {\n        t = timer_read(i);\n        if (i == t_init) {\n            std::cout << \"  \" << std::setw(8) << t_names[i] << \":\" << std::setw(9) << t << \"  (\" << std::setw(6) << t * 100.0 / tmax << \"%)\" << std::endl;\n        } else {\n            std::cout << \"  \" << std::setw(8) << t_names[i] << \":\" << std::setw(9) << t << \"  (\" << std::setw(6) << t * 100.0 / tmax << \"%)\" << std::endl;\n            if (i == t_conj_grad) {\n                t = tmax - t;\n                std::cout << \"    --> total \" << std::setw(8) << t_names[i] << \":\" << std::setw(9) << t << \"  (\" << std::setw(6) << t * 100.0 / tmax << \"%)\" << std::endl;\n            }\n        }\n    }\nL999:\n    return 0;\n}\n\nvoid makea(int n, int nonzer, int* arow, int** acol, double** aelt, double* a, int* colidx, int* rowstr, double rcond, double shift) {\n    int nn1 = 1;\n    while (nn1 < n) {\n        nn1 *= 2;\n    }\n\n    int* ivc = new int[nonzer + 1];\n    double* vc = new double[nonzer + 1];\n\n    for (int iouter = 0; iouter < n; iouter++) {\n        int nzv = nonzer;\n        sprnvc(n, nzv, nn1, vc, ivc);\n        vecset(n, vc, ivc, nzv, iouter, 0.5);\n        arow[iouter] = nzv;\n        for (int ivelt = 0; ivelt < nzv; ivelt++) {\n            acol[ivelt][iouter] = ivc[ivelt];\n            aelt[ivelt][iouter] = vc[ivelt];\n        }\n    }\n\n    sparse(a, colidx, rowstr, n, nonzer, nonzer, arow, acol, aelt, 0, n - 1, nullptr, rcond, shift);\n\n    delete[] ivc;\n    delete[] vc;\n}\n\nvoid sparse(double* a, int* colidx, int* rowstr, int n, int nz, int nonzer, int* arow, int** acol, double** aelt, int firstrow, int lastrow, int* nzloc, double rcond, double shift) {\n    int nrows = lastrow - firstrow + 1;\n\n    int* mark = new int[nrows + 1];\n    for (int j = 0; j <= nrows; j++) {\n        rowstr[j] = 0;\n    }\n\n    for (int i = 0; i < n; i++) {\n        for (int nza = 0; nza < arow[i]; nza++) {\n            int j = acol[nza][i] + 1;\n            rowstr[j] += arow[i];\n        }\n    }\n\n    rowstr[0] = 1;\n    for (int j = 1; j <= nrows; j++) {\n        rowstr[j] += rowstr[j - 1];\n    }\n\n    int nza = rowstr[nrows + 1] - 1;\n    if (nza > nz) {\n        std::cout << \"Space for matrix elements exceeded in sparse\" << std::endl;\n        std::cout << \"nza, nzmax = \" << nza << \", \" << nz << std::endl;\n        return;\n    }\n\n    for (int j = 0; j < nrows; j++) {\n        for (int k = rowstr[j]; k < rowstr[j + 1]; k++) {\n            a[k] = 0.0;\n            colidx[k] = 0;\n        }\n        if (nzloc != nullptr) {\n            nzloc[j] = 0;\n        }\n    }\n\n    double size = 1.0;\n    double ratio = pow(rcond, 1.0 / static_cast<double>(n));\n\n    for (int i = 0; i < n; i++) {\n        for (int nza = 0; nza < arow[i]; nza++) {\n            int j = acol[nza][i];\n            double scale = size * aelt[nza][i];\n            for (int nzrow = 0; nzrow < arow[i]; nzrow++) {\n                int jcol = acol[nzrow][i];\n                double va = aelt[nzrow][i] * scale;\n                if (jcol == j && j == i) {\n                    va += rcond - shift;\n                }\n                for (int k = rowstr[j]; k < rowstr[j + 1]; k++) {\n                    if (colidx[k] > jcol) {\n                        for (int kk = rowstr[j + 1] - 2; kk >= k; kk--) {\n                            if (colidx[kk] > 0) {\n                                a[kk + 1] = a[kk];\n                                colidx[kk + 1] = colidx[kk];\n                            }\n                        }\n                        colidx[k] = jcol;\n                        a[k] = 0.0;\n                        goto label40;\n                    } else if (colidx[k] == 0) {\n                        colidx[k] = jcol;\n                        goto label40;\n                    } else if (colidx[k] == jcol) {\n                        if (nzloc != nullptr) {\n                            nzloc[j] += 1;\n                        }\n                        goto label40;\n                    }\n                }\n                std::cout << \"internal error in sparse: i=\" << i << std::endl;\n                return;\n            label40:\n                a[k] += va;\n            }\n        label60:\n            continue;\n        }\n    }\n\n    for (int j = 1; j <= nrows; j++) {\n        if (nzloc != nullptr) {\n            nzloc[j] += nzloc[j - 1];\n        }\n    }\n\n    for (int j = 0; j < nrows; j++) {\n        int j1 = (j > 0) ? rowstr[j] - nzloc[j - 1] : 1;\n        int j2 = rowstr[j + 1] - nzloc[j] - 1;\n        nza = rowstr[j];\n        for (int k = j1; k <= j2; k++) {\n            a[k] = a[nza];\n            colidx[k] = colidx[nza];\n            nza++;\n        }\n    }\n\n    for (int j = 1; j <= nrows + 1; j++) {\n        rowstr[j] -= nzloc[j - 1];\n    }\n\n    nza = rowstr[nrows + 1] - 1;\n\n    delete[] mark;\n}\n\nvoid sprnvc(int n, int nz, int nn1, double* v, int* iv) {\n    double amult = 1220703125.0;\n    double tran = 0.309;\n    int nzv = 0;\n\n    while (nzv < nz) {\n        double vecelt = tran * amult;\n        double vecloc = tran * amult;\n        int i = icnvrt(vecloc, nn1) + 1;\n        if (i > n) {\n            continue;\n        }\n        bool duplicate = false;\n        for (int ii = 0; ii < nzv; ii++) {\n            if (iv[ii] == i) {\n                duplicate = true;\n                break;\n            }\n        }\n        if (!duplicate) {\n            nzv++;\n            v[nzv - 1] = vecelt;\n            iv[nzv - 1] = i;\n        }\n    }\n}\n\nint icnvrt(double x, int ipwr2) {\n    return static_cast<int>(ipwr2 * x);\n}\n\nvoid vecset(int n, double* v, int* iv, int nzv, int i, double val) {\n    bool set = false;\n    for (int k = 0; k < nzv; k++) {\n        if (iv[k] == i) {\n            v[k] = val;\n            set = true;\n            break;\n        }\n    }\n    if (!set) {\n        nzv++;\n        v[nzv - 1] = val;\n        iv[nzv - 1] = i;\n    }\n}\n\nint main() {\n    int n = 10;\n    int nonzer = 5;\n    int* arow = new int[n];\n    int** acol = new int*[nonzer + 1];\n    double** aelt = new double*[nonzer + 1];\n    for (int i = 0; i < nonzer + 1; i++) {\n        acol[i] = new int[n];\n        aelt[i] = new double[n];\n    }\n    double* a = new double[n];\n    int* colidx = new int[n];\n    int* rowstr = new int[n + 1];\n    double rcond = 0.5;\n    double shift = 0.0;\n\n    makea(n, nonzer, arow, acol, aelt, a, colidx, rowstr, rcond, shift);\n\n    // Print the resulting arrays\n    std::cout << \"arow: \";\n    for (int i = 0; i < n; i++) {\n        std::cout << arow[i] << \" \";\n    }\n    std::cout << std::endl;\n\n    std::cout << \"acol: \" << std::endl;\n    for (int i = 0; i < nonzer + 1; i++) {\n        for (int j = 0; j < n; j++) {\n            std::cout << acol[i][j] << \" \";\n        }\n        std::cout << std::endl;\n    }\n\n    std::cout << \"aelt: \" << std::endl;\n    for (int i = 0; i < nonzer + 1; i++) {\n        for (int j = 0; j < n; j++) {\n            std::cout << aelt[i][j] << \" \";\n        }\n        std::cout << std::endl;\n    }\n\n    std::cout << \"a: \";\n    for (int i = 0; i < n; i++) {\n        std::cout << a[i] << \" \";\n    }\n    std::cout << std::endl;\n\n    std::cout << \"colidx: \";\n    for (int i = 0; i < n; i++) {\n        std::cout << colidx[i] << \" \";\n    }\n    std::cout << std::endl;\n\n    std::cout << \"rowstr: \";\n    for (int i = 0; i < n + 1; i++) {\n        std::cout << rowstr[i] << \" \";\n    }\n    std::cout << std::endl;\n\n    delete[] arow;\n    for (int i = 0; i < nonzer + 1; i++) {\n        delete[] acol[i];\n        delete[] aelt[i];\n    }\n    delete[] acol;\n    delete[] aelt;\n    delete[] a;\n    delete[] colidx;\n    delete[] rowstr;\n\n    return 0;\n}\n\n\n", "label": 2}
{"code": "#include <iostream>\n#include \"npb-CPP.hpp\"\n\n#include \"globals.hpp\"\n\n/* parameters */\n#define T_BENCH\t1\n#define\tT_INIT\t2\n\n/* global variables */\n/* common /grid/ */\nstatic int is1, is2, is3, ie1, ie2, ie3;\n\n/* functions prototypes */\nstatic void setup(int *n1, int *n2, int *n3, int lt);\nstatic void mg3P(double ****u, double ***v, double ****r, double a[4], double c[4], int n1, int n2, int n3, int k);\nstatic void psinv( double ***r, double ***u, int n1, int n2, int n3, double c[4], int k);\nstatic void resid( double ***u, double ***v, double ***r, int n1, int n2, int n3, double a[4], int k );\nstatic void rprj3( double ***r, int m1k, int m2k, int m3k, double ***s, int m1j, int m2j, int m3j, int k );\nstatic void interp( double ***z, int mm1, int mm2, int mm3, double ***u, int n1, int n2, int n3, int k );\nstatic void norm2u3(double ***r, int n1, int n2, int n3, double *rnm2, double *rnmu, int nx, int ny, int nz);\nstatic void rep_nrm(double ***u, int n1, int n2, int n3, char *title, int kk);\nstatic void comm3(double ***u, int n1, int n2, int n3, int kk);\nstatic void zran3(double ***z, int n1, int n2, int n3, int nx, int ny, int k);\nstatic void showall(double ***z, int n1, int n2, int n3);\nstatic double power( double a, int n );\nstatic void bubble( double ten[M][2], int j1[M][2], int j2[M][2], int j3[M][2], int m, int ind );\nstatic void zero3(double ***z, int n1, int n2, int n3);\n/*static void nonzero(double ***z, int n1, int n2, int n3);*/\n\n/*--------------------------------------------------------------------\n      program mg\nc-------------------------------------------------------------------*/\n\nint main(int argc, char *argv[]) {\n\n    /*-------------------------------------------------------------------------\n    c k is the current level. It is passed down through subroutine args\n    c and is NOT global. it is the current iteration\n    c------------------------------------------------------------------------*/\n\n    int k, it;\n    double t, tinit, mflops;\n\n    /*-------------------------------------------------------------------------\n    c These arrays are in common because they are quite large\n    c and probably shouldn't be allocated on the stack. They\n    c are always passed as subroutine args. \n    c------------------------------------------------------------------------*/\n    \n    double ****u, ***v, ****r;\n    double a[4], c[4];\n\n    double rnm2, rnmu;\n    double epsilon = 1.0e-8;\n    int n1, n2, n3, nit;\n    double verify_value;\n    boolean verified;\n\n    int i, j, l;\n    FILE *fp;\n\n    timer_clear(T_BENCH);\n    timer_clear(T_INIT);\n\n    timer_start(T_INIT);\n\n    /*----------------------------------------------------------------------\n    c Read in and broadcast input data\n    c---------------------------------------------------------------------*/\n\n    printf(\"NAS Parallel Benchmarks 4.0 OpenMP C++ version\" \" - MG Benchmark\\n\");\n    printf(\"Developed by: Dalvan Griebler <dalvan.griebler@acad.pucrs.br> & J\u00fanior L\u00f6ff <loffjh@gmail.com>\\n\\n\");\n\n    fp = fopen(\"mg.input\", \"r\");\n    if (fp != NULL) {\n    \tprintf(\" Reading from input file mg.input\\n\");\n    \tif (fscanf(fp, \"%d\", &lt) != 1){\n            printf(\" Error in reading elements\\n\");\n            exit(1);\n        }\n    \twhile(fgetc(fp) != '\\n');\n    \tif (fscanf(fp, \"%d%d%d\", &nx[lt], &ny[lt], &nz[lt]) != 3){\n            printf(\" Error in reading elements\\n\");\n            exit(1);\n        }\n    \twhile(fgetc(fp) != '\\n');\n    \tif (fscanf(fp, \"%d\", &nit) != 1){\n            printf(\" Error in reading elements\\n\");\n            exit(1);\n        }\n    \twhile(fgetc(fp) != '\\n');\n    \tfor (i = 0; i <= 7; i++) {\n    \t    if (fscanf(fp, \"%d\", &debug_vec[i]) != 1){\n                printf(\" Error in reading elements\\n\");\n                exit(1);\n            }\n    \t}\n    \tfclose(fp);\n    } else {\n    \tprintf(\" No input file. Using compiled defaults\\n\");\n        \n    \tlt = LT_DEFAULT;\n    \tnit = NIT_DEFAULT;\n    \tnx[lt] = NX_DEFAULT;\n    \tny[lt] = NY_DEFAULT;\n    \tnz[lt] = NZ_DEFAULT;\n\n    \tfor (i = 0; i <= 7; i++) {\n    \t    debug_vec[i] = DEBUG_DEFAULT;\n    \t}\n    }\n\n    if ( (nx[lt] != ny[lt]) || (nx[lt] != nz[lt]) ) {\n\t   class_npb = 'U';\n    } else if( nx[lt] == 32 && nit == 4 ) {\n\t   class_npb = 'S';\n    } else if( nx[lt] == 64 && nit == 40 ) {\n\t   class_npb = 'W';\n    } else if( nx[lt] == 256 && nit == 20 ) {\n\t   class_npb = 'B';\n    } else if( nx[lt] == 512 && nit == 20 ) {\n\t   class_npb = 'C';\n    } else if( nx[lt] == 256 && nit == 4 ) {\n\t   class_npb = 'A';\n    } else {\n\t   class_npb = 'U';\n    }\n\n    /*--------------------------------------------------------------------\n    c  Use these for debug info:\n    c---------------------------------------------------------------------\n    c     debug_vec(0) = 1 !=> report all norms\n    c     debug_vec(1) = 1 !=> some setup information\n    c     debug_vec(1) = 2 !=> more setup information\n    c     debug_vec(2) = k => at level k or below, show result of resid\n    c     debug_vec(3) = k => at level k or below, show result of psinv\n    c     debug_vec(4) = k => at level k or below, show result of rprj\n    c     debug_vec(5) = k => at level k or below, show result of interp\n    c     debug_vec(6) = 1 => (unused)\n    c     debug_vec(7) = 1 => (unused)\n    c-------------------------------------------------------------------*/\n\n    a[0] = -8.0/3.0;\n    a[1] =  0.0;\n    a[2] =  1.0/6.0;\n    a[3] =  1.0/12.0;\n\n    if (class_npb == 'A' || class_npb == 'S' || class_npb =='W') {\n        /*--------------------------------------------------------------------\n        c     Coefficients for the S(a) smoother\n            c-------------------------------------------------------------------*/\n    \tc[0] =  -3.0/8.0;\n    \tc[1] =  1.0/32.0;\n    \tc[2] =  -1.0/64.0;\n    \tc[3] =   0.0;\n    } else {\n        /*--------------------------------------------------------------------\n        c     Coefficients for the S(b) smoother\n        c-------------------------------------------------------------------*/\n    \tc[0] =  -3.0/17.0;\n    \tc[1] =  1.0/33.0;\n    \tc[2] =  -1.0/61.0;\n    \tc[3] =   0.0;\n    }\n    \n    lb = 1;\n\n    setup(&n1,&n2,&n3,lt);\n      \n    u = (double ****)malloc((lt+1)*sizeof(double ***));\n    for (l = lt; l >=1; l--) {\n    \tu[l] = (double ***)malloc(m3[l]*sizeof(double **));\n    \tfor (k = 0; k < m3[l]; k++) {\n    \t    u[l][k] = (double **)malloc(m2[l]*sizeof(double *));\n    \t    for (j = 0; j < m2[l]; j++) {\n    \t\t  u[l][k][j] = (double *)malloc(m1[l]*sizeof(double));\n    \t    }\n    \t}\n    }\n    v = (double ***)malloc(m3[lt]*sizeof(double **));\n    for (k = 0; k < m3[lt]; k++) {\n\t    v[k] = (double **)malloc(m2[lt]*sizeof(double *));\n\t    for (j = 0; j < m2[lt]; j++) {\n\t       v[k][j] = (double *)malloc(m1[lt]*sizeof(double));\n\t    }\n    }\n    r = (double ****)malloc((lt+1)*sizeof(double ***));\n    for (l = lt; l >=1; l--) {\n    \tr[l] = (double ***)malloc(m3[l]*sizeof(double **));\n    \tfor (k = 0; k < m3[l]; k++) {\n    \t    r[l][k] = (double **)malloc(m2[l]*sizeof(double *));\n    \t    for (j = 0; j < m2[l]; j++) {\n    \t\t    r[l][k][j] = (double *)malloc(m1[l]*sizeof(double));\n    \t    }\n    \t}\n    }\n\n    zero3(u[lt],n1,n2,n3);\n    zran3(v,n1,n2,n3,nx[lt],ny[lt],lt);\n\n    norm2u3(v,n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);\n\n    \n    /*printf(\"\\n norms of random v are\\n\");\n    printf(\" %4d%19.12e%19.12e\\n\", 0, rnm2, rnmu);\n    printf(\" about to evaluate resid, k= %d\\n\", lt);*/\n\n    printf(\" Size: %3dx%3dx%3d (class_npb %1c)\\n\", nx[lt], ny[lt], nz[lt], class_npb);\n    printf(\" Iterations: %3d\\n\", nit);\n    \n\n    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);\n    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);\n\n    /*c---------------------------------------------------------------------\n    c     One iteration for startup\n    c---------------------------------------------------------------------*/\n    mg3P(u,v,r,a,c,n1,n2,n3,lt);\n    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);\n\n    setup(&n1,&n2,&n3,lt);\n\n    zero3(u[lt],n1,n2,n3);\n    zran3(v,n1,n2,n3,nx[lt],ny[lt],lt);\n\n    timer_stop(T_INIT);\n\n    timer_start(T_BENCH);\n\n    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);\n    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);\n\n    for ( it = 1; it <= nit; it++) {\n    \tmg3P(u,v,r,a,c,n1,n2,n3,lt);\n    \tresid(u[lt],v,r[lt],n1,n2,n3,a,lt);\n    }\n    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);\n\n\n    timer_stop(T_BENCH);\n\n    t = timer_read(T_BENCH);\n    tinit = timer_read(T_INIT);\n\n    verified = FALSE;\n    verify_value = 0.0;\n\n    printf(\" Initialization time: %15.3f seconds\\n\", tinit);\n    printf(\" Benchmark completed\\n\");\n\n    if (class_npb != 'U') {\n    \tif (class_npb == 'S') {\n                verify_value = 0.530770700573e-04;\n    \t} else if (class_npb == 'W') {\n                verify_value = 0.250391406439e-17;  /* 40 iterations*/\n            /*\t0.183103168997d-044 iterations*/\n    \t} else if (class_npb == 'A') {\n                verify_value = 0.2433365309e-5;\n            } else if (class_npb == 'B') {\n                verify_value = 0.180056440132e-5;\n            } else if (class_npb == 'C') {\n                verify_value = 0.570674826298e-06;\n    \t}\n\n    \tif ( fabs( rnm2 - verify_value ) <= epsilon ) {\n                verified = TRUE;\n    \t    printf(\" VERIFICATION SUCCESSFUL\\n\");\n    \t    printf(\" L2 Norm is %20.12e\\n\", rnm2);\n    \t    printf(\" Error is   %20.12e\\n\", rnm2 - verify_value);\n    \t} else {\n                verified = FALSE;\n    \t    printf(\" VERIFICATION FAILED\\n\");\n    \t    printf(\" L2 Norm is             %20.12e\\n\", rnm2);\n    \t    printf(\" The correct L2 Norm is %20.12e\\n\", verify_value);\n    \t}\n    } else {\n    \tverified = FALSE;\n    \tprintf(\" Problem size unknown\\n\");\n    \tprintf(\" NO VERIFICATION PERFORMED\\n\");\n    }\n\n    if ( t != 0.0 ) {\n    \tint nn = nx[lt]*ny[lt]*nz[lt];\n    \tmflops = 58.*nit*nn*1.0e-6 / t;\n    } else {\n\t   mflops = 0.0;\n    }\n\n    c_print_results((char*)\"MG\", class_npb, nx[lt], ny[lt], nz[lt], nit, t, mflops, (char*)\"          floating point\", \n\t\t    verified, (char*)NPBVERSION, (char*)COMPILETIME, (char*)CS1, (char*)CS2, (char*)CS3, (char*)CS4, (char*)CS5, (char*)CS6, (char*)CS7);\n    return 0;\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void setup(int *n1, int *n2, int *n3, int lt) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    int k;\n\n    for ( k = lt-1; k >= 1; k--) {\n\tnx[k] = nx[k+1]/2;\n\tny[k] = ny[k+1]/2;\n\tnz[k] = nz[k+1]/2;\n    }\n\n    for (k = 1; k <= lt; k++) {\n\tm1[k] = nx[k]+2;\n\tm2[k] = nz[k]+2;\n\tm3[k] = ny[k]+2;\n    }\n\n    is1 = 1;\n    ie1 = nx[lt];\n    *n1 = nx[lt]+2;\n    is2 = 1;\n    ie2 = ny[lt];\n    *n2 = ny[lt]+2;\n    is3 = 1;\n    ie3 = nz[lt];\n    *n3 = nz[lt]+2;\n\n    if (debug_vec[1] >=  1 ) {\n\tprintf(\" in setup, \\n\");\n\tprintf(\"  lt  nx  ny  nz  n1  n2  n3 is1 is2 is3 ie1 ie2 ie3\\n\");\n\tprintf(\"%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d\\n\",\n\t       lt,nx[lt],ny[lt],nz[lt],*n1,*n2,*n3,is1,is2,is3,ie1,ie2,ie3);\n    }\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void mg3P(double ****u, double ***v, double ****r, double a[4],\n\t\t double c[4], int n1, int n2, int n3, int k) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     multigrid V-cycle routine\n    c-------------------------------------------------------------------*/\n\n    int j;\n\n    /*--------------------------------------------------------------------\n    c     down cycle.\n    c     restrict the residual from the find grid to the coarse\n    c-------------------------------------------------------------------*/\n\n    for (k = lt; k >= lb+1; k--) {\n    \tj = k-1;\n    \trprj3(r[k], m1[k], m2[k], m3[k],\n\t      r[j], m1[j], m2[j], m3[j], k);\n    }\n\n    k = lb;\n    /*--------------------------------------------------------------------\n    c     compute an approximate solution on the coarsest grid\n    c-------------------------------------------------------------------*/\n    zero3(u[k], m1[k], m2[k], m3[k]);\n    psinv(r[k], u[k], m1[k], m2[k], m3[k], c, k);\n\n    for (k = lb+1; k <= lt-1; k++) {\n    \tj = k-1;\n        /*--------------------------------------------------------------------\n        c        prolongate from level k-1  to k\n        c-------------------------------------------------------------------*/\n    \tzero3(u[k], m1[k], m2[k], m3[k]);\n    \tinterp(u[j], m1[j], m2[j], m3[j],\n    \t       u[k], m1[k], m2[k], m3[k], k);\n        /*--------------------------------------------------------------------\n        c        compute residual for level k\n        c-------------------------------------------------------------------*/\n    \tresid(u[k], r[k], r[k], m1[k], m2[k], m3[k], a, k);\n        /*--------------------------------------------------------------------\n        c        apply smoother\n        c-------------------------------------------------------------------*/\n    \tpsinv(r[k], u[k], m1[k], m2[k], m3[k], c, k);\n    }\n\n    j = lt - 1;\n    k = lt;\n    interp(u[j], m1[j], m2[j], m3[j], u[lt], n1, n2, n3, k);\n    resid(u[lt], v, r[lt], n1, n2, n3, a, k);\n    psinv(r[lt], u[lt], n1, n2, n3, c, k);\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void psinv( double ***r, double ***u, int n1, int n2, int n3, double c[4], int k) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     psinv applies an approximate inverse as smoother:  u = u + Cr\n    c\n    c     This  implementation costs  15A + 4M per result, where\n    c     A and M denote the costs of Addition and Multiplication.  \n    c     Presuming coefficient c(3) is zero (the NPB assumes this,\n    c     but it is thus not a general case), 2A + 1M may be eliminated,\n    c     resulting in 13A + 3M.\n    c     Note that this vectorizes, and is also fine for cache \n    c     based machines.  \n    c-------------------------------------------------------------------*/\n\n    int i3, i2, i1;\n    double r1[M], r2[M];\n\n    for (i3 = 1; i3 < n3-1; i3++) {\n    \tfor (i2 = 1; i2 < n2-1; i2++) {\n                for (i1 = 0; i1 < n1; i1++) {\n    \t\tr1[i1] = r[i3][i2-1][i1] + r[i3][i2+1][i1]\n    \t\t    + r[i3-1][i2][i1] + r[i3+1][i2][i1];\n    \t\tr2[i1] = r[i3-1][i2-1][i1] + r[i3-1][i2+1][i1]\n    \t\t    + r[i3+1][i2-1][i1] + r[i3+1][i2+1][i1];\n    \t    }\n                for (i1 = 1; i1 < n1-1; i1++) {\n    \t\tu[i3][i2][i1] = u[i3][i2][i1]\n    \t\t    + c[0] * r[i3][i2][i1]\n    \t\t    + c[1] * ( r[i3][i2][i1-1] + r[i3][i2][i1+1]\n    \t\t\t       + r1[i1] )\n    \t\t    + c[2] * ( r2[i1] + r1[i1-1] + r1[i1+1] );\n            /*--------------------------------------------------------------------\n            c  Assume c(3) = 0    (Enable line below if c(3) not= 0)\n            c---------------------------------------------------------------------\n            c    >                     + c(3) * ( r2(i1-1) + r2(i1+1) )\n            c-------------------------------------------------------------------*/\n    \t    }\n    \t}\n    }\n\n    /*--------------------------------------------------------------------\n    c     exchange boundary points\n    c-------------------------------------------------------------------*/\n    comm3(u,n1,n2,n3,k);\n\n    if (debug_vec[0] >= 1 ) {\n    \trep_nrm(u,n1,n2,n3,(char*)\"   psinv\",k);\n    }\n\n    if ( debug_vec[3] >= k ) {\n    \tshowall(u,n1,n2,n3);\n    }\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void resid( double ***u, double ***v, double ***r, int n1, int n2, int n3, double a[4], int k ) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     resid computes the residual:  r = v - Au\n    c\n    c     This  implementation costs  15A + 4M per result, where\n    c     A and M denote the costs of Addition (or Subtraction) and \n    c     Multiplication, respectively. \n    c     Presuming coefficient a(1) is zero (the NPB assumes this,\n    c     but it is thus not a general case), 3A + 1M may be eliminated,\n    c     resulting in 12A + 3M.\n    c     Note that this vectorizes, and is also fine for cache \n    c     based machines.  \n    c-------------------------------------------------------------------*/\n\n    int i3, i2, i1;\n    double u1[M], u2[M];\n    for (i3 = 1; i3 < n3-1; i3++) {\n    \tfor (i2 = 1; i2 < n2-1; i2++) {\n            for (i1 = 0; i1 < n1; i1++) {\n        \t\tu1[i1] = u[i3][i2-1][i1] + u[i3][i2+1][i1]\n        \t\t       + u[i3-1][i2][i1] + u[i3+1][i2][i1];\n        \t\tu2[i1] = u[i3-1][i2-1][i1] + u[i3-1][i2+1][i1]\n        \t\t       + u[i3+1][i2-1][i1] + u[i3+1][i2+1][i1];\n        \t    }\n        \t    for (i1 = 1; i1 < n1-1; i1++) {\n        \t\tr[i3][i2][i1] = v[i3][i2][i1]\n        \t\t    - a[0] * u[i3][i2][i1]\n                /*--------------------------------------------------------------------\n                c  Assume a(1) = 0      (Enable 2 lines below if a(1) not= 0)\n                c---------------------------------------------------------------------\n                c    >                     - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)\n                c    >                              + u1(i1) )\n                c-------------------------------------------------------------------*/\n        \t\t- a[2] * ( u2[i1] + u1[i1-1] + u1[i1+1] )\n        \t\t      - a[3] * ( u2[i1-1] + u2[i1+1] );\n    \t    }\n    \t}\n    }\n\n    /*--------------------------------------------------------------------\n    c     exchange boundary data\n    c--------------------------------------------------------------------*/\n    comm3(r,n1,n2,n3,k);\n\n    if (debug_vec[0] >= 1 ) {\n    \trep_nrm(r,n1,n2,n3,(char*)\"   resid\",k);\n    }\n\n    if ( debug_vec[2] >= k ) {\n    \tshowall(r,n1,n2,n3);\n    }\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void rprj3( double ***r, int m1k, int m2k, int m3k, double ***s, int m1j, int m2j, int m3j, int k ) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     rprj3 projects onto the next coarser grid, \n    c     using a trilinear Finite Element projection:  s = r' = P r\n    c     \n    c     This  implementation costs  20A + 4M per result, where\n    c     A and M denote the costs of Addition and Multiplication.  \n    c     Note that this vectorizes, and is also fine for cache \n    c     based machines.  \n    c-------------------------------------------------------------------*/\n\n    int j3, j2, j1, i3, i2, i1, d1, d2, d3;\n\n    double x1[M], y1[M], x2, y2;\n\n\n    if (m1k == 3) {\n        d1 = 2;\n    } else {\n        d1 = 1;\n    }\n\n    if (m2k == 3) {\n        d2 = 2;\n    } else {\n        d2 = 1;\n    }\n\n    if (m3k == 3) {\n        d3 = 2;\n    } else {\n        d3 = 1;\n    }\n    for (j3 = 1; j3 < m3j-1; j3++) {\n    \ti3 = 2*j3-d3;\n        /*C        i3 = 2*j3-1*/\n\t    for (j2 = 1; j2 < m2j-1; j2++) {\n            i2 = 2*j2-d2;\n            /*C  i2 = 2*j2-1*/\n\n            for (j1 = 1; j1 < m1j; j1++) {\n        \t\ti1 = 2*j1-d1;\n            /*C             i1 = 2*j1-1*/\n        \t\tx1[i1] = r[i3+1][i2][i1] + r[i3+1][i2+2][i1]\n        \t\t    + r[i3][i2+1][i1] + r[i3+2][i2+1][i1];\n        \t\ty1[i1] = r[i3][i2][i1] + r[i3+2][i2][i1]\n        \t\t    + r[i3][i2+2][i1] + r[i3+2][i2+2][i1];\n    \t    }\n\n            for (j1 = 1; j1 < m1j-1; j1++) {\n        \t\ti1 = 2*j1-d1;\n                /*C             i1 = 2*j1-1*/\n        \t\ty2 = r[i3][i2][i1+1] + r[i3+2][i2][i1+1]\n        \t\t    + r[i3][i2+2][i1+1] + r[i3+2][i2+2][i1+1];\n        \t\tx2 = r[i3+1][i2][i1+1] + r[i3+1][i2+2][i1+1]\n        \t\t    + r[i3][i2+1][i1+1] + r[i3+2][i2+1][i1+1];\n        \t\ts[j3][j2][j1] =\n        \t\t    0.5 * r[i3+1][i2+1][i1+1]\n        \t\t    + 0.25 * ( r[i3+1][i2+1][i1] + r[i3+1][i2+1][i1+2] + x2)\n        \t\t    + 0.125 * ( x1[i1] + x1[i1+2] + y2)\n        \t\t    + 0.0625 * ( y1[i1] + y1[i1+2] );\n    \t    }\n\t    }\n    }\n    comm3(s,m1j,m2j,m3j,k-1);\n\n    if (debug_vec[0] >= 1 ) {\n    \trep_nrm(s,m1j,m2j,m3j,(char*)\"   rprj3\",k-1);\n    }\n\n    if (debug_vec[4] >= k ) {\n    \tshowall(s,m1j,m2j,m3j);\n    }\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void interp( double ***z, int mm1, int mm2, int mm3, double ***u, int n1, int n2, int n3, int k ) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     interp adds the trilinear interpolation of the correction\n    c     from the coarser grid to the current approximation:  u = u + Qu'\n    c     \n    c     Observe that this  implementation costs  16A + 4M, where\n    c     A and M denote the costs of Addition and Multiplication.  \n    c     Note that this vectorizes, and is also fine for cache \n    c     based machines.  Vector machines may get slightly better \n    c     performance however, with 8 separate \"do i1\" loops, rather than 4.\n    c-------------------------------------------------------------------*/\n\n    int i3, i2, i1, d1, d2, d3, t1, t2, t3;\n\n    /*\n    c note that m = 1037 in globals.h but for this only need to be\n    c 535 to handle up to 1024^3\n    c      integer m\n    c      parameter( m=535 )\n    */\n    double z1[M], z2[M], z3[M];\n\n    if ( n1 != 3 && n2 != 3 && n3 != 3 ) {\n    \tfor (i3 = 0; i3 < mm3-1; i3++) {\n            for (i2 = 0; i2 < mm2-1; i2++) {\n        \t\tfor (i1 = 0; i1 < mm1; i1++) {\n        \t\t    z1[i1] = z[i3][i2+1][i1] + z[i3][i2][i1];\n        \t\t    z2[i1] = z[i3+1][i2][i1] + z[i3][i2][i1];\n        \t\t    z3[i1] = z[i3+1][i2+1][i1] + z[i3+1][i2][i1] + z1[i1];\n        \t\t}\n        \t\tfor (i1 = 0; i1 < mm1-1; i1++) {\n        \t\t    u[2*i3][2*i2][2*i1] = u[2*i3][2*i2][2*i1]\n        \t\t\t+z[i3][i2][i1];\n        \t\t    u[2*i3][2*i2][2*i1+1] = u[2*i3][2*i2][2*i1+1]\n        \t\t\t+0.5*(z[i3][i2][i1+1]+z[i3][i2][i1]);\n        \t\t}\n        \t\tfor (i1 = 0; i1 < mm1-1; i1++) {\n        \t\t    u[2*i3][2*i2+1][2*i1] = u[2*i3][2*i2+1][2*i1]\n        \t\t\t+0.5 * z1[i1];\n        \t\t    u[2*i3][2*i2+1][2*i1+1] = u[2*i3][2*i2+1][2*i1+1]\n        \t\t\t+0.25*( z1[i1] + z1[i1+1] );\n        \t\t}\n        \t\tfor (i1 = 0; i1 < mm1-1; i1++) {\n        \t\t    u[2*i3+1][2*i2][2*i1] = u[2*i3+1][2*i2][2*i1]\n        \t\t\t+0.5 * z2[i1];\n        \t\t    u[2*i3+1][2*i2][2*i1+1] = u[2*i3+1][2*i2][2*i1+1]\n        \t\t\t+0.25*( z2[i1] + z2[i1+1] );\n        \t\t}\n        \t\tfor (i1 = 0; i1 < mm1-1; i1++) {\n        \t\t    u[2*i3+1][2*i2+1][2*i1] = u[2*i3+1][2*i2+1][2*i1]\n        \t\t\t+0.25* z3[i1];\n        \t\t    u[2*i3+1][2*i2+1][2*i1+1] = u[2*i3+1][2*i2+1][2*i1+1]\n        \t\t\t+0.125*( z3[i1] + z3[i1+1] );\n        \t\t}\n    \t    }\n    \t}\n    } else {\n    \tif (n1 == 3) {\n                d1 = 2;\n                t1 = 1;\n    \t} else {\n                d1 = 1;\n                t1 = 0;\n    \t}      \n    \tif (n2 == 3) {\n                d2 = 2;\n                t2 = 1;\n    \t} else {\n                d2 = 1;\n                t2 = 0;\n    \t}          \n    \tif (n3 == 3) {\n                d3 = 2;\n                t3 = 1;\n    \t} else {\n                d3 = 1;\n                t3 = 0;\n    \t}\n\n\t    for ( i3 = d3; i3 <= mm3-1; i3++) {\n            for ( i2 = d2; i2 <= mm2-1; i2++) {\n        \t\tfor ( i1 = d1; i1 <= mm1-1; i1++) {\n        \t\t    u[2*i3-d3-1][2*i2-d2-1][2*i1-d1-1] =\n        \t\t\tu[2*i3-d3-1][2*i2-d2-1][2*i1-d1-1]\n        \t\t\t+z[i3-1][i2-1][i1-1];\n        \t\t}\n        \t\tfor ( i1 = 1; i1 <= mm1-1; i1++) {\n        \t\t    u[2*i3-d3-1][2*i2-d2-1][2*i1-t1-1] =\n        \t\t\tu[2*i3-d3-1][2*i2-d2-1][2*i1-t1-1]\n        \t\t\t+0.5*(z[i3-1][i2-1][i1]+z[i3-1][i2-1][i1-1]);\n        \t\t}\n\t        }\n            for ( i2 = 1; i2 <= mm2-1; i2++) {\n        \t\tfor ( i1 = d1; i1 <= mm1-1; i1++) {\n        \t\t    u[2*i3-d3-1][2*i2-t2-1][2*i1-d1-1] =\n        \t\t\tu[2*i3-d3-1][2*i2-t2-1][2*i1-d1-1]\n        \t\t\t+0.5*(z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]);\n        \t\t}\n                for ( i1 = 1; i1 <= mm1-1; i1++) {\n        \t\t    u[2*i3-d3-1][2*i2-t2-1][2*i1-t1-1] =\n        \t\t\tu[2*i3-d3-1][2*i2-t2-1][2*i1-t1-1]\n        \t\t\t+0.25*(z[i3-1][i2][i1]+z[i3-1][i2-1][i1]\n        \t\t\t       +z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]);\n\t            }\n\t       }\n\t    }\n\n\t    for ( i3 = 1; i3 <= mm3-1; i3++) {\n            for ( i2 = d2; i2 <= mm2-1; i2++) {\n        \t\tfor ( i1 = d1; i1 <= mm1-1; i1++) {\n        \t\t    u[2*i3-t3-1][2*i2-d2-1][2*i1-d1-1] =\n        \t\t\tu[2*i3-t3-1][2*i2-d2-1][2*i1-d1-1]\n        \t\t\t+0.5*(z[i3][i2-1][i1-1]+z[i3-1][i2-1][i1-1]);\n        \t\t}\n        \t\tfor ( i1 = 1; i1 <= mm1-1; i1++) {\n        \t\t    u[2*i3-t3-1][2*i2-d2-1][2*i1-t1-1] =\n        \t\t\tu[2*i3-t3-1][2*i2-d2-1][2*i1-t1-1]\n        \t\t\t+0.25*(z[i3][i2-1][i1]+z[i3][i2-1][i1-1]\n        \t\t\t       +z[i3-1][i2-1][i1]+z[i3-1][i2-1][i1-1]);\n        \t\t}\n            }\n    \t    for ( i2 = 1; i2 <= mm2-1; i2++) {\n        \t\tfor ( i1 = d1; i1 <= mm1-1; i1++) {\n        \t\t    u[2*i3-t3-1][2*i2-t2-1][2*i1-d1-1] =\n        \t\t\tu[2*i3-t3-1][2*i2-t2-1][2*i1-d1-1]\n        \t\t\t+0.25*(z[i3][i2][i1-1]+z[i3][i2-1][i1-1]\n        \t\t\t       +z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]);\n        \t\t}\n        \t\tfor ( i1 = 1; i1 <= mm1-1; i1++) {\n        \t\t    u[2*i3-t3-1][2*i2-t2-1][2*i1-t1-1] =\n        \t\t\tu[2*i3-t3-1][2*i2-t2-1][2*i1-t1-1]\n        \t\t\t+0.125*(z[i3][i2][i1]+z[i3][i2-1][i1]\n        \t\t\t\t+z[i3][i2][i1-1]+z[i3][i2-1][i1-1]\n        \t\t\t\t+z[i3-1][i2][i1]+z[i3-1][i2-1][i1]\n        \t\t\t\t+z[i3-1][i2][i1-1]+z[i3-1][i2-1][i1-1]);\n        \t\t}\n    \t    }\n\t    }\n    }\n    if (debug_vec[0] >= 1 ) {\n        rep_nrm(z,mm1,mm2,mm3,(char*)\"z: inter\",k-1);\n        rep_nrm(u,n1,n2,n3,(char*)\"u: inter\",k);\n    }\n    if ( debug_vec[5] >= k ) {\n        showall(z,mm1,mm2,mm3);\n        showall(u,n1,n2,n3);\n    }\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void norm2u3(double ***r, int n1, int n2, int n3, double *rnm2, double *rnmu, int nx, int ny, int nz) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     norm2u3 evaluates approximations to the L2 norm and the\n    c     uniform (or L-infinity or Chebyshev) norm, under the\n    c     assumption that the boundaries are periodic or zero.  Add the\n    c     boundaries in with half weight (quarter weight on the edges\n    c     and eighth weight at the corners) for inhomogeneous boundaries.\n    c-------------------------------------------------------------------*/\n\n    static double s = 0.0;\n    double tmp;\n    int i3, i2, i1, n;\n    double p_s = 0.0, p_a = 0.0;\n\n    n = nx*ny*nz;\n  \n    for (i3 = 1; i3 < n3-1; i3++) {\n    \tfor (i2 = 1; i2 < n2-1; i2++) {\n            for (i1 = 1; i1 < n1-1; i1++) {\n        \t\tp_s = p_s + r[i3][i2][i1] * r[i3][i2][i1];\n        \t\ttmp = fabs(r[i3][i2][i1]);\n        \t\tif (tmp > p_a) p_a = tmp;\n        \t}\n    \t}\n    }\n    \n\ts += p_s;\n\tif (p_a > *rnmu) *rnmu = p_a;\n    \n\t*rnm2 = sqrt(s/(double)n);\n\ts = 0.0;\n    \n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void rep_nrm(double ***u, int n1, int n2, int n3, char *title, int kk) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     report on norm\n    c-------------------------------------------------------------------*/\n\n    double rnm2, rnmu;\n    norm2u3(u,n1,n2,n3,&rnm2,&rnmu,nx[kk],ny[kk],nz[kk]);\n    printf(\" Level%2d in %8s: norms =%21.14e%21.14e\\n\", kk, title, rnm2, rnmu);\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void comm3(double ***u, int n1, int n2, int n3, int kk) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     comm3 organizes the communication on all borders \n    c-------------------------------------------------------------------*/\n\n    int i1, i2, i3;\n    /* axis = 1 */\n    for ( i3 = 1; i3 < n3-1; i3++) {\n    \tfor ( i2 = 1; i2 < n2-1; i2++) {\n    \t    u[i3][i2][n1-1] = u[i3][i2][1];\n    \t    u[i3][i2][0] = u[i3][i2][n1-2];\n    \t}\n    }\n    /* axis = 2 */\n    for ( i3 = 1; i3 < n3-1; i3++) {\n    \tfor ( i1 = 0; i1 < n1; i1++) {\n    \t    u[i3][n2-1][i1] = u[i3][1][i1];\n    \t    u[i3][0][i1] = u[i3][n2-2][i1];\n    \t}\n    }\n    /* axis = 3 */\n    for ( i2 = 0; i2 < n2; i2++) {\n    \tfor ( i1 = 0; i1 < n1; i1++) {\n    \t    u[n3-1][i2][i1] = u[1][i2][i1];\n    \t    u[0][i2][i1] = u[n3-2][i2][i1];\n    \t}\n    }\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void zran3(double ***z, int n1, int n2, int n3, int nx, int ny, int k) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     zran3  loads +1 at ten randomly chosen points,\n    c     loads -1 at a different ten random points,\n    c     and zero elsewhere.\n    c-------------------------------------------------------------------*/\n\n    #define MM\t10\n    #define\tA\tpow(5.0,13)\n    #define\tX\t314159265.e0    \n    \n    int i0, m0, m1;\n    /*int i1, i2, i3, d1, e1, e2, e3;*/\n    int i1, i2, i3, d1, e2, e3;\n    double xx, x0, x1, a1, a2, ai;\n\n    double ten[MM][2], best;\n    int i, j1[MM][2], j2[MM][2], j3[MM][2];\n    \n\n    /*double rdummy;*/\n\n    a1 = power( A, nx );\n    a2 = power( A, nx*ny );\n\n    zero3(z,n1,n2,n3);\n\n    i = is1-1+nx*(is2-1+ny*(is3-1));\n\n    ai = power( A, i );\n    d1 = ie1 - is1 + 1;\n    /*e1 = ie1 - is1 + 2;*/\n    e2 = ie2 - is2 + 2;\n    e3 = ie3 - is3 + 2;\n    x0 = X;\n    /*rdummy = */randlc( &x0, ai );\n    \n    for (i3 = 1; i3 < e3; i3++) {\n\t   x1 = x0;\n    \tfor (i2 = 1; i2 < e2; i2++) {\n            xx = x1;\n            vranlc( d1, &xx, A, &(z[i3][i2][0]));\n            /*rdummy = */randlc( &x1, a1 );\n    \t}\n\t   /*rdummy = */randlc( &x0, a2 );\n    }\n\n    /*--------------------------------------------------------------------\n    c       call comm3(z,n1,n2,n3)\n    c       call showall(z,n1,n2,n3)\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     each processor looks for twenty candidates\n    c-------------------------------------------------------------------*/\n\n    for (i = 0; i < MM; i++) {\n    \tten[i][1] = 0.0;\n    \tj1[i][1] = 0;\n    \tj2[i][1] = 0;\n    \tj3[i][1] = 0;\n    \tten[i][0] = 1.0;\n    \tj1[i][0] = 0;\n    \tj2[i][0] = 0;\n    \tj3[i][0] = 0;\n    }\n    for (i3 = 1; i3 < n3-1; i3++) {\n    \tfor (i2 = 1; i2 < n2-1; i2++) {\n            for (i1 = 1; i1 < n1-1; i1++) {\n        \t\tif ( z[i3][i2][i1] > ten[0][1] ) {\n        \t\t    ten[0][1] = z[i3][i2][i1];\n        \t\t    j1[0][1] = i1;\n        \t\t    j2[0][1] = i2;\n        \t\t    j3[0][1] = i3;\n        \t\t    bubble( ten, j1, j2, j3, MM, 1 );\n        \t\t}\n        \t\tif ( z[i3][i2][i1] < ten[0][0] ) {\n        \t\t    ten[0][0] = z[i3][i2][i1];\n        \t\t    j1[0][0] = i1;\n        \t\t    j2[0][0] = i2;\n        \t\t    j3[0][0] = i3;\n        \t\t    bubble( ten, j1, j2, j3, MM, 0 );\n        \t\t}\n    \t    }\n    \t}\n    }\n\n    /*--------------------------------------------------------------------\n    c     Now which of these are globally best?\n    c-------------------------------------------------------------------*/\n    i1 = MM - 1;\n    i0 = MM - 1;\n    int jg[4][MM][2];\n    for (i = MM - 1 ; i >= 0; i--) {\n    \tbest = z[j3[i1][1]][j2[i1][1]][j1[i1][1]];\n    \tif (best == z[j3[i1][1]][j2[i1][1]][j1[i1][1]]) {\n            jg[0][i][1] = 0;\n            jg[1][i][1] = is1 - 1 + j1[i1][1];\n            jg[2][i][1] = is2 - 1 + j2[i1][1];\n            jg[3][i][1] = is3 - 1 + j3[i1][1];\n            i1 = i1-1;\n    \t} else {\n            jg[0][i][1] = 0;\n            jg[1][i][1] = 0;\n            jg[2][i][1] = 0;\n            jg[3][i][1] = 0;\n    \t}\n    \tten[i][1] = best;\n    \tbest = z[j3[i0][0]][j2[i0][0]][j1[i0][0]];\n    \tif (best == z[j3[i0][0]][j2[i0][0]][j1[i0][0]]) {\n            jg[0][i][0] = 0;\n            jg[1][i][0] = is1 - 1 + j1[i0][0];\n            jg[2][i][0] = is2 - 1 + j2[i0][0];\n            jg[3][i][0] = is3 - 1 + j3[i0][0];\n            i0 = i0-1;\n    \t} else {\n            jg[0][i][0] = 0;\n            jg[1][i][0] = 0;\n            jg[2][i][0] = 0;\n            jg[3][i][0] = 0;\n    \t}\n    \tten[i][0] = best;\n    }\n    m1 = i1+1;\n    m0 = i0+1;\n\n   /* printf(\" negative charges at\");\n    for (i = 0; i < MM; i++) {\n    \tif (i%5 == 0) printf(\"\\n\");\n    \tprintf(\" (%3d,%3d,%3d)\", jg[1][i][0], jg[2][i][0], jg[3][i][0]);\n    }\n    printf(\"\\n positive charges at\");\n    for (i = 0; i < MM; i++) {\n    \tif (i%5 == 0) printf(\"\\n\");\n    \tprintf(\" (%3d,%3d,%3d)\", jg[1][i][1], jg[2][i][1], jg[3][i][1]);\n    }\n    printf(\"\\n small random numbers were\\n\");\n    for (i = MM-1; i >= 0; i--) {\n\t   printf(\" %15.8e\", ten[i][0]);\n    }\n    printf(\"\\n and they were found on processor number\\n\");\n    for (i = MM-1; i >= 0; i--) {\n\t   printf(\" %4d\", jg[0][i][0]);\n    }\n    printf(\"\\n large random numbers were\\n\");\n    for (i = MM-1; i >= 0; i--) {\n\t   printf(\" %15.8e\", ten[i][1]);\n    }\n    printf(\"\\n and they were found on processor number\\n\");\n    for (i = MM-1; i >= 0; i--) {\n\t   printf(\" %4d\", jg[0][i][1]);\n    }\n    printf(\"\\n\");*/\n\n    for (i3 = 0; i3 < n3; i3++) {\n    \tfor (i2 = 0; i2 < n2; i2++) {\n            for (i1 = 0; i1 < n1; i1++) {\n    \t\t  z[i3][i2][i1] = 0.0;\n    \t    }\n    \t}\n    }\n    for (i = MM-1; i >= m0; i--) {\n\t   z[j3[i][0]][j2[i][0]][j1[i][0]] = -1.0;\n    }\n    for (i = MM-1; i >= m1; i--) {\n\t   z[j3[i][1]][j2[i][1]][j1[i][1]] = 1.0;\n    }\n    comm3(z,n1,n2,n3,k);\n\n    /*--------------------------------------------------------------------\n    c          call showall(z,n1,n2,n3)\n    c-------------------------------------------------------------------*/\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void showall(double ***z, int n1, int n2, int n3) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    int i1,i2,i3;\n    int m1, m2, m3;\n\n    m1 = min(n1,18);\n    m2 = min(n2,14);\n    m3 = min(n3,18);\n\n    printf(\"\\n\");\n    for (i3 = 0; i3 < m3; i3++) {\n    \tfor (i1 = 0; i1 < m1; i1++) {\n    \t    for (i2 = 0; i2 < m2; i2++) {\n    \t\t  printf(\"%6.3f\", z[i3][i2][i1]);\n    \t    }\n    \t    printf(\"\\n\");\n    \t}\n\t   printf(\" - - - - - - - \\n\");\n    }\n    printf(\"\\n\");\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic double power( double a, int n ) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     power  raises an integer, disguised as a double\n    c     precision real, to an integer power\n    c-------------------------------------------------------------------*/\n    double aj;\n    int nj;\n    /* double rdummy;*/\n    double power;\n\n    power = 1.0;\n    nj = n;\n    aj = a;\n\n    while (nj != 0) {\n\t   if( (nj%2) == 1 ) /*rdummy =  */randlc( &power, aj );\n    \t/*rdummy = */randlc( &aj, aj );\n    \tnj = nj/2;\n    }\n    \n    return (power);\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void bubble( double ten[M][2], int j1[M][2], int j2[M][2], int j3[M][2], int m, int ind ) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    /*--------------------------------------------------------------------\n    c     bubble        does a bubble sort in direction dir\n    c-------------------------------------------------------------------*/\n    double temp;\n    int i, j_temp;\n    if ( ind == 1 ) {\n    \tfor (i = 0; i < m-1; i++) {\n            if ( ten[i][ind] > ten[i+1][ind] ) {\n        \t\ttemp = ten[i+1][ind];\n        \t\tten[i+1][ind] = ten[i][ind];\n        \t\tten[i][ind] = temp;\n\n        \t\tj_temp = j1[i+1][ind];\n        \t\tj1[i+1][ind] = j1[i][ind];\n        \t\tj1[i][ind] = j_temp;\n\n        \t\tj_temp = j2[i+1][ind];\n        \t\tj2[i+1][ind] = j2[i][ind];\n        \t\tj2[i][ind] = j_temp;\n\n        \t\tj_temp = j3[i+1][ind];\n        \t\tj3[i+1][ind] = j3[i][ind];\n        \t\tj3[i][ind] = j_temp;\n    \t    } else {\n    \t\t  return;\n    \t    }\n    \t}\n        } else {\n    \tfor (i = 0; i < m-1; i++) {\n            if ( ten[i][ind] < ten[i+1][ind]){\n\n        \t\ttemp = ten[i+1][ind];\n        \t\tten[i+1][ind] = ten[i][ind];\n        \t\tten[i][ind] = temp;\n\n        \t\tj_temp = j1[i+1][ind];\n        \t\tj1[i+1][ind] = j1[i][ind];\n        \t\tj1[i][ind] = j_temp;\n\n        \t\tj_temp = j2[i+1][ind];\n        \t\tj2[i+1][ind] = j2[i][ind];\n        \t\tj2[i][ind] = j_temp;\n\n        \t\tj_temp = j3[i+1][ind];\n        \t\tj3[i+1][ind] = j3[i][ind];\n        \t\tj3[i][ind] = j_temp;\n    \t    } else {\n    \t\t  return;\n    \t    }\n    \t}\n    }\n}\n\n/*--------------------------------------------------------------------\nc-------------------------------------------------------------------*/\n\nstatic void zero3(double ***z, int n1, int n2, int n3) {\n\n    /*--------------------------------------------------------------------\n    c-------------------------------------------------------------------*/\n\n    int i1, i2, i3;\n    for (i3 = 0;i3 < n3; i3++) {\n    \tfor (i2 = 0; i2 < n2; i2++) {\n            for (i1 = 0; i1 < n1; i1++) {\n    \t\t  z[i3][i2][i1] = 0.0;\n    \t    }\n    \t}\n    }\n}", "label": 2}
{"code": "\n#include \"../common/npb-CPP.hpp\"\n#include \"npbparams.hpp\"\n\n#define IMAX PROBLEM_SIZE\n#define JMAX PROBLEM_SIZE\n#define KMAX PROBLEM_SIZE\n#define IMAXP (IMAX/2*2)\n#define JMAXP (JMAX/2*2)\n#define AA 0\n#define BB 1\n#define CC 2\n#define BLOCK_SIZE 5\n#define T_TOTAL 1\n#define T_RHSX 2\n#define T_RHSY 3\n#define T_RHSZ 4\n#define T_RHS 5\n#define T_XSOLVE 6\n#define T_YSOLVE 7\n#define T_ZSOLVE 8\n#define T_RDIS1 9\n#define T_RDIS2 10\n#define T_ADD 11\n#define T_LAST 11\n\n/* global variables */\n#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION)\nstatic double us[KMAX][JMAXP+1][IMAXP+1];\nstatic double vs[KMAX][JMAXP+1][IMAXP+1];\nstatic double ws[KMAX][JMAXP+1][IMAXP+1];\nstatic double qs[KMAX][JMAXP+1][IMAXP+1];\nstatic double rho_i[KMAX][JMAXP+1][IMAXP+1];\nstatic double square[KMAX][JMAXP+1][IMAXP+1];\nstatic double forcing[KMAX][JMAXP+1][IMAXP+1][5];\nstatic double u[KMAX][JMAXP+1][IMAXP+1][5];\nstatic double rhs[KMAX][JMAXP+1][IMAXP+1][5];\nstatic double cuf[PROBLEM_SIZE+1];\nstatic double q[PROBLEM_SIZE+1];\nstatic double ue[5][PROBLEM_SIZE+1];\nstatic double buf[5][PROBLEM_SIZE+1];\nstatic double fjac[PROBLEM_SIZE+1][5][5];\nstatic double njac[PROBLEM_SIZE+1][5][5];\nstatic double lhs[PROBLEM_SIZE+1][3][5][5];\nstatic double ce[13][5];\n#else\nstatic double (*us)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*vs)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*ws)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*qs)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*rho_i)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*square)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*forcing)[JMAXP+1][IMAXP+1][5]=(double(*)[JMAXP+1][IMAXP+1][5])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*u)[JMAXP+1][IMAXP+1][5]=(double(*)[JMAXP+1][IMAXP+1][5])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*rhs)[JMAXP+1][IMAXP+1][5]=(double(*)[JMAXP+1][IMAXP+1][5])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*cuf)=(double*)malloc(sizeof(double)*(PROBLEM_SIZE+1));\nstatic double (*q)=(double*)malloc(sizeof(double)*(PROBLEM_SIZE+1));\nstatic double (*ue)[PROBLEM_SIZE+1]=(double(*)[PROBLEM_SIZE+1])malloc(sizeof(double)*((PROBLEM_SIZE+1)*(5)));\nstatic double (*buf)[PROBLEM_SIZE+1]=(double(*)[PROBLEM_SIZE+1])malloc(sizeof(double)*((PROBLEM_SIZE+1)*(5)));\nstatic double (*fjac)[5][5]=(double(*)[5][5])malloc(sizeof(double)*((PROBLEM_SIZE+1)*(5)*(5)));\nstatic double (*njac)[5][5]=(double(*)[5][5])malloc(sizeof(double)*((PROBLEM_SIZE+1)*(5)*(5)));\ndouble (*lhs)[3][5][5]=(double(*)[3][5][5])malloc(sizeof(double)*((PROBLEM_SIZE+1)*(3)*(5)*(5)));\nstatic double (*ce)[5]=(double(*)[5])malloc(sizeof(double)*((13)*(5)));\n#endif\nstatic double tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, \n\t      dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, \n\t      dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, \n\t      dxmax, dymax, dzmax, xxcon1, xxcon2, \n\t      xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,\n\t      dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,\n\t      yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,\n\t      zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, \n\t      dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, \n\t      dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, \n\t      c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1,\n\t      dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, \n\t      c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, \n\t      c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16,\n\t      elapsed_time, tmp1, tmp2, tmp3;\nstatic int grid_points[3];\nstatic boolean timeron;\n\n/* function prototypes */\nstatic void add();\nstatic void adi();\nstatic void binvcrhs(double lhs[5][5], double c[5][5], double r[5]);\nstatic void binvrhs(double lhs[5][5], double r[5]);\nstatic void compute_rhs();\nstatic void error_norm(double rms[5]);\nstatic void exact_rhs();\nstatic void exact_solution(double xi, double eta, double zeta, double dtemp[5]);\nstatic void initialize();\nstatic void lhsinit(double lhs[][3][5][5], int size);\nstatic void matmul_sub(double ablock[5][5], double bblock[5][5], double cblock[5][5]);\nstatic void matvec_sub(double ablock[5][5], double avec[5], double bvec[5]);\nstatic void rhs_norm(double rms[5]);\nstatic void set_constants();\nstatic void verify(int no_time_steps, char* class_npb, boolean* verified);\nstatic void x_solve();\nstatic void y_solve();\nstatic void z_solve();\n\n/* bt */\nint main(int argc, char* argv[]){\n#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION)\n\tprintf(\" DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION mode on\\n\");\n#endif\n\tint i, niter, step;\n\tdouble navg, mflops, n3;\n\tdouble tmax, t, trecs[T_LAST+1];\n\tboolean verified;\n\tchar class_npb;\n\tchar* t_names[T_LAST+1];\n\t/*\n\t * ---------------------------------------------------------------------\n\t * root node reads input file (if it exists) else takes\n\t * defaults from parameters\n\t * ---------------------------------------------------------------------\n\t */\n\tFILE* fp;\n\tif((fp=fopen(\"inputbt.data\",\"r\"))!=NULL){\n\t\tint avoid_warning;\n\t\tprintf(\" Reading from input file inputbt.data\\n\");\n\t\tavoid_warning=fscanf(fp,\"%d\",&niter);\n\t\twhile(fgetc(fp)!='\\n');\n\t\tavoid_warning=fscanf(fp,\"%lf\",&dt);\n\t\twhile(fgetc(fp)!='\\n');\n\t\tavoid_warning=fscanf(fp,\"%d%d%d\\n\",&grid_points[0],&grid_points[1],&grid_points[2]);\n\t\tfclose(fp);\n\t}else{\n\t\tprintf(\" No input file inputbt.data. Using compiled defaults\\n\");\n\t\tniter=NITER_DEFAULT;\n\t\tdt=DT_DEFAULT;\n\t\tgrid_points[0]=PROBLEM_SIZE;\n\t\tgrid_points[1]=PROBLEM_SIZE;\n\t\tgrid_points[2]=PROBLEM_SIZE;\n\t}\n\tif((fp=fopen(\"timer.flag\",\"r\"))!= NULL){\n\t\ttimeron=TRUE;\n\t\tt_names[T_TOTAL]=(char*)\"total\";\n\t\tt_names[T_RHSX]=(char*)\"rhsx\";\n\t\tt_names[T_RHSY]=(char*)\"rhsy\";\n\t\tt_names[T_RHSZ]=(char*)\"rhsz\";\n\t\tt_names[T_RHS]=(char*)\"rhs\";\n\t\tt_names[T_XSOLVE]=(char*)\"xsolve\";\n\t\tt_names[T_YSOLVE]=(char*)\"ysolve\";\n\t\tt_names[T_ZSOLVE]=(char*)\"zsolve\";\n\t\tt_names[T_RDIS1]=(char*)\"redist1\";\n\t\tt_names[T_RDIS2]=(char*)\"redist2\";\n\t\tt_names[T_ADD]=(char*)\"add\";\n\t\tfclose(fp);\n\t}else{\n\t\ttimeron=FALSE;\n\t}\n\tprintf(\"\\n\\n NAS Parallel Benchmarks 4.1 Serial C++ version - BT Benchmark\\n\\n\");\n\tprintf(\" Size: %4dx%4dx%4d\\n\",grid_points[0],grid_points[1],grid_points[2]);\n\tprintf(\" Iterations: %4d    dt: %10.6f\\n\",niter,dt);\n\tprintf(\"\\n\");\n\tif((grid_points[0]>IMAX)||(grid_points[1]>JMAX)||(grid_points[2]>KMAX)){\n\t\tprintf(\" %d, %d, %d\\n\",grid_points[0],grid_points[1],grid_points[2]);\n\t\tprintf(\" Problem size too big for compiled array sizes\\n\");\n\t\treturn 0;\n\t}\n\tset_constants();\n\tfor(i=1;i<=T_LAST;i++){timer_clear(i);}\n\tinitialize();\n\texact_rhs();\n\t/*\n\t * ---------------------------------------------------------------------\n\t * do one time step to touch all code, and reinitialize\n\t * ---------------------------------------------------------------------\n\t */\n\tadi();\n\tinitialize();\n\tfor(i=1;i<=T_LAST;i++){timer_clear(i);}\n\ttimer_start(1);\n\tfor(step=1; step<=niter; step++){\n\t\tif((step%20)==0||step==1){\n\t\t\tprintf(\" Time step %4d\\n\",step);\n\t\t}\n\t\tadi();\n\t}\n\ttimer_stop(1);\n\ttmax=timer_read(1);\n\tverify(niter, &class_npb, &verified);\n\tn3=1.0*grid_points[0]*grid_points[1]*grid_points[2];\n\tnavg=(grid_points[0]+grid_points[1]+grid_points[2])/3.0;\n\tif(tmax!=0.0){\n\t\tmflops=1.0e-6*(double)niter*\n\t\t\t(3478.8*n3-17655.7*(navg*navg)+28023.7*navg)\n\t\t\t/tmax;\n\t}else{\n\t\tmflops=0.0;\n\t}\n\tc_print_results((char*)\"BT\",\n\t\t\tclass_npb,\n\t\t\tgrid_points[0],\n\t\t\tgrid_points[1],\n\t\t\tgrid_points[2],\n\t\t\tniter,\n\t\t\ttmax,\n\t\t\tmflops,\n\t\t\t(char*)\"          floating point\",\n\t\t\tverified,\n\t\t\t(char*)NPBVERSION,\n\t\t\t(char*)COMPILETIME,\n\t\t\t(char*)COMPILERVERSION,\n\t\t\t(char*)CS1,\n\t\t\t(char*)CS2,\n\t\t\t(char*)CS3,\n\t\t\t(char*)CS4,\n\t\t\t(char*)CS5,\n\t\t\t(char*)CS6,\n\t\t\t(char*)\"(none)\");\n\t/*\n\t * ---------------------------------------------------------------------\n\t * more timers\n\t * ---------------------------------------------------------------------\n\t */\n\tif(timeron){\n\t\tfor(i=1; i<=T_LAST; i++){\n\t\t\ttrecs[i]=timer_read(i);\n\t\t}\n\t\tif(tmax==0.0){tmax=1.0;}\n\t\tprintf(\"  SECTION   Time (secs)\\n\");\n\t\tfor(i=1; i<=T_LAST; i++){\n\t\t\tprintf(\"  %-8s:%9.3f  (%6.2f%%)\\n\", \n\t\t\t\t\tt_names[i], trecs[i], trecs[i]*100./tmax);\n\t\t\tif(i==T_RHS){\n\t\t\t\tt=trecs[T_RHSX]+trecs[T_RHSY]+trecs[T_RHSZ];\n\t\t\t\tprintf(\"    --> %8s:%9.3f  (%6.2f%%)\\n\",\"sub-rhs\",t,t*100./tmax);\n\t\t\t\tt=trecs[T_RHS]-t;\n\t\t\t\tprintf(\"    --> %8s:%9.3f  (%6.2f%%)\\n\", \"rest-rhs\",t,t*100./tmax);\n\t\t\t}else if(i==T_ZSOLVE){\n\t\t\t\tt=trecs[T_ZSOLVE]-trecs[T_RDIS1]-trecs[T_RDIS2];\n\t\t\t\tprintf(\"    --> %8s:%9.3f  (%6.2f%%)\\n\", \"sub-zsol\",t,t*100./tmax);\n\t\t\t}else if(i==T_RDIS2){\n\t\t\t\tt=trecs[T_RDIS1]+trecs[T_RDIS2];\n\t\t\t\tprintf(\"    --> %8s:%9.3f  (%6.2f%%)\\n\",\"redist\",t,t*100./tmax);\n\t\t\t}\n\t\t}\n\t}\n\treturn 0;\n}\n\n/*\n * ---------------------------------------------------------------------\n * addition of update to the vector u\n * ---------------------------------------------------------------------\n */\nvoid add(){\n\tint i, j, k, m;\n\tif(timeron){timer_start(T_ADD);}\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tu[k][j][i][m]=u[k][j][i][m]+rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_ADD);}\n}\n\nvoid adi(){\n\tcompute_rhs();\n\tx_solve();\n\ty_solve();\n\tz_solve();\n\tadd();\n}\n\nvoid binvcrhs(double lhs[5][5], double c[5][5], double r[5]){\n\tdouble pivot, coeff;\n\tpivot=1.00/lhs[0][0];\n\tlhs[1][0]=lhs[1][0]*pivot;\n\tlhs[2][0]=lhs[2][0]*pivot;\n\tlhs[3][0]=lhs[3][0]*pivot;\n\tlhs[4][0]=lhs[4][0]*pivot;\n\tc[0][0]=c[0][0]*pivot;\n\tc[1][0]=c[1][0]*pivot;\n\tc[2][0]=c[2][0]*pivot;\n\tc[3][0]=c[3][0]*pivot;\n\tc[4][0]=c[4][0]*pivot;\n\tr[0]=r[0]*pivot;\n\t/* */\n\tcoeff=lhs[0][1];\n\tlhs[1][1]=lhs[1][1]-coeff*lhs[1][0];\n\tlhs[2][1]=lhs[2][1]-coeff*lhs[2][0];\n\tlhs[3][1]=lhs[3][1]-coeff*lhs[3][0];\n\tlhs[4][1]=lhs[4][1]-coeff*lhs[4][0];\n\tc[0][1]=c[0][1]-coeff*c[0][0];\n\tc[1][1]=c[1][1]-coeff*c[1][0];\n\tc[2][1]=c[2][1]-coeff*c[2][0];\n\tc[3][1]=c[3][1]-coeff*c[3][0];\n\tc[4][1]=c[4][1]-coeff*c[4][0];\n\tr[1]=r[1]-coeff*r[0];\n\t/* */\n\tcoeff=lhs[0][2];\n\tlhs[1][2]=lhs[1][2]-coeff*lhs[1][0];\n\tlhs[2][2]=lhs[2][2]-coeff*lhs[2][0];\n\tlhs[3][2]=lhs[3][2]-coeff*lhs[3][0];\n\tlhs[4][2]=lhs[4][2]-coeff*lhs[4][0];\n\tc[0][2]=c[0][2]-coeff*c[0][0];\n\tc[1][2]=c[1][2]-coeff*c[1][0];\n\tc[2][2]=c[2][2]-coeff*c[2][0];\n\tc[3][2]=c[3][2]-coeff*c[3][0];\n\tc[4][2]=c[4][2]-coeff*c[4][0];\n\tr[2]=r[2]-coeff*r[0];\n\t/* */\n\tcoeff=lhs[0][3];\n\tlhs[1][3]=lhs[1][3]-coeff*lhs[1][0];\n\tlhs[2][3]=lhs[2][3]-coeff*lhs[2][0];\n\tlhs[3][3]=lhs[3][3]-coeff*lhs[3][0];\n\tlhs[4][3]=lhs[4][3]-coeff*lhs[4][0];\n\tc[0][3]=c[0][3]-coeff*c[0][0];\n\tc[1][3]=c[1][3]-coeff*c[1][0];\n\tc[2][3]=c[2][3]-coeff*c[2][0];\n\tc[3][3]=c[3][3]-coeff*c[3][0];\n\tc[4][3]=c[4][3]-coeff*c[4][0];\n\tr[3]=r[3]-coeff*r[0];\n\t/* */\n\tcoeff=lhs[0][4];\n\tlhs[1][4]=lhs[1][4]-coeff*lhs[1][0];\n\tlhs[2][4]=lhs[2][4]-coeff*lhs[2][0];\n\tlhs[3][4]=lhs[3][4]-coeff*lhs[3][0];\n\tlhs[4][4]=lhs[4][4]-coeff*lhs[4][0];\n\tc[0][4]=c[0][4]-coeff*c[0][0];\n\tc[1][4]=c[1][4]-coeff*c[1][0];\n\tc[2][4]=c[2][4]-coeff*c[2][0];\n\tc[3][4]=c[3][4]-coeff*c[3][0];\n\tc[4][4]=c[4][4]-coeff*c[4][0];\n\tr[4]=r[4]-coeff*r[0];\n\t/* */\n\tpivot=1.00/lhs[1][1];\n\tlhs[2][1]=lhs[2][1]*pivot;\n\tlhs[3][1]=lhs[3][1]*pivot;\n\tlhs[4][1]=lhs[4][1]*pivot;\n\tc[0][1]=c[0][1]*pivot;\n\tc[1][1]=c[1][1]*pivot;\n\tc[2][1]=c[2][1]*pivot;\n\tc[3][1]=c[3][1]*pivot;\n\tc[4][1]=c[4][1]*pivot;\n\tr[1]=r[1]*pivot;\n\t/* */\n\tcoeff=lhs[1][0];\n\tlhs[2][0]=lhs[2][0]-coeff*lhs[2][1];\n\tlhs[3][0]=lhs[3][0]-coeff*lhs[3][1];\n\tlhs[4][0]=lhs[4][0]-coeff*lhs[4][1];\n\tc[0][0]=c[0][0]-coeff*c[0][1];\n\tc[1][0]=c[1][0]-coeff*c[1][1];\n\tc[2][0]=c[2][0]-coeff*c[2][1];\n\tc[3][0]=c[3][0]-coeff*c[3][1];\n\tc[4][0]=c[4][0]-coeff*c[4][1];\n\tr[0]=r[0]-coeff*r[1];\n\t/* */\n\tcoeff = lhs[1][2];\n\tlhs[2][2]=lhs[2][2]-coeff*lhs[2][1];\n\tlhs[3][2]=lhs[3][2]-coeff*lhs[3][1];\n\tlhs[4][2]=lhs[4][2]-coeff*lhs[4][1];\n\tc[0][2]=c[0][2]-coeff*c[0][1];\n\tc[1][2]=c[1][2]-coeff*c[1][1];\n\tc[2][2]=c[2][2]-coeff*c[2][1];\n\tc[3][2]=c[3][2]-coeff*c[3][1];\n\tc[4][2]=c[4][2]-coeff*c[4][1];\n\tr[2]=r[2]-coeff*r[1];\n\t/* */\n\tcoeff=lhs[1][3];\n\tlhs[2][3]=lhs[2][3]-coeff*lhs[2][1];\n\tlhs[3][3]=lhs[3][3]-coeff*lhs[3][1];\n\tlhs[4][3]=lhs[4][3]-coeff*lhs[4][1];\n\tc[0][3]=c[0][3]-coeff*c[0][1];\n\tc[1][3]=c[1][3]-coeff*c[1][1];\n\tc[2][3]=c[2][3]-coeff*c[2][1];\n\tc[3][3]=c[3][3]-coeff*c[3][1];\n\tc[4][3]=c[4][3]-coeff*c[4][1];\n\tr[3]=r[3]-coeff*r[1];\n\t/* */\n\tcoeff=lhs[1][4];\n\tlhs[2][4]=lhs[2][4]-coeff*lhs[2][1];\n\tlhs[3][4]=lhs[3][4]-coeff*lhs[3][1];\n\tlhs[4][4]=lhs[4][4]-coeff*lhs[4][1];\n\tc[0][4]=c[0][4]-coeff*c[0][1];\n\tc[1][4]=c[1][4]-coeff*c[1][1];\n\tc[2][4]=c[2][4]-coeff*c[2][1];\n\tc[3][4]=c[3][4]-coeff*c[3][1];\n\tc[4][4]=c[4][4]-coeff*c[4][1];\n\tr[4]=r[4]-coeff*r[1];\n\t/* */\n\tpivot = 1.00/lhs[2][2];\n\tlhs[3][2]=lhs[3][2]*pivot;\n\tlhs[4][2]=lhs[4][2]*pivot;\n\tc[0][2]=c[0][2]*pivot;\n\tc[1][2]=c[1][2]*pivot;\n\tc[2][2]=c[2][2]*pivot;\n\tc[3][2]=c[3][2]*pivot;\n\tc[4][2]=c[4][2]*pivot;\n\tr[2]=r[2]*pivot;\n\t/* */\n\tcoeff=lhs[2][0];\n\tlhs[3][0]=lhs[3][0]-coeff*lhs[3][2];\n\tlhs[4][0]=lhs[4][0]-coeff*lhs[4][2];\n\tc[0][0]=c[0][0]-coeff*c[0][2];\n\tc[1][0]=c[1][0]-coeff*c[1][2];\n\tc[2][0]=c[2][0]-coeff*c[2][2];\n\tc[3][0]=c[3][0]-coeff*c[3][2];\n\tc[4][0]=c[4][0]-coeff*c[4][2];\n\tr[0]=r[0]-coeff*r[2];\n\t/* */\n\tcoeff=lhs[2][1];\n\tlhs[3][1]=lhs[3][1]-coeff*lhs[3][2];\n\tlhs[4][1]=lhs[4][1]-coeff*lhs[4][2];\n\tc[0][1]=c[0][1]-coeff*c[0][2];\n\tc[1][1]=c[1][1]-coeff*c[1][2];\n\tc[2][1]=c[2][1]-coeff*c[2][2];\n\tc[3][1]=c[3][1]-coeff*c[3][2];\n\tc[4][1]=c[4][1]-coeff*c[4][2];\n\tr[1]=r[1]-coeff*r[2];\n\t/* */\n\tcoeff=lhs[2][3];\n\tlhs[3][3]=lhs[3][3]-coeff*lhs[3][2];\n\tlhs[4][3]=lhs[4][3]-coeff*lhs[4][2];\n\tc[0][3]=c[0][3]-coeff*c[0][2];\n\tc[1][3]=c[1][3]-coeff*c[1][2];\n\tc[2][3]=c[2][3]-coeff*c[2][2];\n\tc[3][3]=c[3][3]-coeff*c[3][2];\n\tc[4][3]=c[4][3]-coeff*c[4][2];\n\tr[3]=r[3]-coeff*r[2];\n\t/* */\n\tcoeff=lhs[2][4];\n\tlhs[3][4]=lhs[3][4]-coeff*lhs[3][2];\n\tlhs[4][4]=lhs[4][4]-coeff*lhs[4][2];\n\tc[0][4]=c[0][4]-coeff*c[0][2];\n\tc[1][4]=c[1][4]-coeff*c[1][2];\n\tc[2][4]=c[2][4]-coeff*c[2][2];\n\tc[3][4]=c[3][4]-coeff*c[3][2];\n\tc[4][4]=c[4][4]-coeff*c[4][2];\n\tr[4]=r[4]-coeff*r[2];\n\t/* */\n\tpivot=1.00/lhs[3][3];\n\tlhs[4][3]=lhs[4][3]*pivot;\n\tc[0][3]=c[0][3]*pivot;\n\tc[1][3]=c[1][3]*pivot;\n\tc[2][3]=c[2][3]*pivot;\n\tc[3][3]=c[3][3]*pivot;\n\tc[4][3]=c[4][3]*pivot;\n\tr[3]=r[3] *pivot;\n\t/* */\n\tcoeff=lhs[3][0];\n\tlhs[4][0]=lhs[4][0]-coeff*lhs[4][3];\n\tc[0][0]=c[0][0]-coeff*c[0][3];\n\tc[1][0]=c[1][0]-coeff*c[1][3];\n\tc[2][0]=c[2][0]-coeff*c[2][3];\n\tc[3][0]=c[3][0]-coeff*c[3][3];\n\tc[4][0]=c[4][0]-coeff*c[4][3];\n\tr[0]=r[0]-coeff*r[3];\n\t/* */\n\tcoeff=lhs[3][1];\n\tlhs[4][1]=lhs[4][1]-coeff*lhs[4][3];\n\tc[0][1]=c[0][1]-coeff*c[0][3];\n\tc[1][1]=c[1][1]-coeff*c[1][3];\n\tc[2][1]=c[2][1]-coeff*c[2][3];\n\tc[3][1]=c[3][1]-coeff*c[3][3];\n\tc[4][1]=c[4][1]-coeff*c[4][3];\n\tr[1]=r[1]-coeff*r[3];\n\t/* */\n\tcoeff=lhs[3][2];\n\tlhs[4][2]=lhs[4][2]-coeff*lhs[4][3];\n\tc[0][2]=c[0][2]-coeff*c[0][3];\n\tc[1][2]=c[1][2]-coeff*c[1][3];\n\tc[2][2]=c[2][2]-coeff*c[2][3];\n\tc[3][2]=c[3][2]-coeff*c[3][3];\n\tc[4][2]=c[4][2]-coeff*c[4][3];\n\tr[2]=r[2]-coeff*r[3];\n\t/* */\n\tcoeff=lhs[3][4];\n\tlhs[4][4]=lhs[4][4]-coeff*lhs[4][3];\n\tc[0][4]=c[0][4]-coeff*c[0][3];\n\tc[1][4]=c[1][4]-coeff*c[1][3];\n\tc[2][4]=c[2][4]-coeff*c[2][3];\n\tc[3][4]=c[3][4]-coeff*c[3][3];\n\tc[4][4]=c[4][4]-coeff*c[4][3];\n\tr[4]=r[4]-coeff*r[3];\n\t/* */\n\tpivot=1.00/lhs[4][4];\n\tc[0][4]=c[0][4]*pivot;\n\tc[1][4]=c[1][4]*pivot;\n\tc[2][4]=c[2][4]*pivot;\n\tc[3][4]=c[3][4]*pivot;\n\tc[4][4]=c[4][4]*pivot;\n\tr[4]=r[4]*pivot;\n\t/* */\n\tcoeff=lhs[4][0];\n\tc[0][0]=c[0][0]-coeff*c[0][4];\n\tc[1][0]=c[1][0]-coeff*c[1][4];\n\tc[2][0]=c[2][0]-coeff*c[2][4];\n\tc[3][0]=c[3][0]-coeff*c[3][4];\n\tc[4][0]=c[4][0]-coeff*c[4][4];\n\tr[0]=r[0]-coeff*r[4];\n\t/* */\n\tcoeff=lhs[4][1];\n\tc[0][1]=c[0][1]-coeff*c[0][4];\n\tc[1][1]=c[1][1]-coeff*c[1][4];\n\tc[2][1]=c[2][1]-coeff*c[2][4];\n\tc[3][1]=c[3][1]-coeff*c[3][4];\n\tc[4][1]=c[4][1]-coeff*c[4][4];\n\tr[1]=r[1]-coeff*r[4];\n\t/* */\n\tcoeff=lhs[4][2];\n\tc[0][2]=c[0][2]-coeff*c[0][4];\n\tc[1][2]=c[1][2]-coeff*c[1][4];\n\tc[2][2]=c[2][2]-coeff*c[2][4];\n\tc[3][2]=c[3][2]-coeff*c[3][4];\n\tc[4][2]=c[4][2]-coeff*c[4][4];\n\tr[2]=r[2]-coeff*r[4];\n\t/* */\n\tcoeff=lhs[4][3];\n\tc[0][3]=c[0][3]-coeff*c[0][4];\n\tc[1][3]=c[1][3]-coeff*c[1][4];\n\tc[2][3]=c[2][3]-coeff*c[2][4];\n\tc[3][3]=c[3][3]-coeff*c[3][4];\n\tc[4][3]=c[4][3]-coeff*c[4][4];\n\tr[3]=r[3]-coeff*r[4];\n}\n\nvoid binvrhs(double lhs[5][5], double r[5]){\n\tdouble pivot, coeff;\n\tpivot=1.00/lhs[0][0];\n\tlhs[1][0]=lhs[1][0]*pivot;\n\tlhs[2][0]=lhs[2][0]*pivot;\n\tlhs[3][0]=lhs[3][0]*pivot;\n\tlhs[4][0]=lhs[4][0]*pivot;\n\tr[0]=r[0]*pivot;\n\t/* */\n\tcoeff=lhs[0][1];\n\tlhs[1][1]=lhs[1][1]-coeff*lhs[1][0];\n\tlhs[2][1]=lhs[2][1]-coeff*lhs[2][0];\n\tlhs[3][1]=lhs[3][1]-coeff*lhs[3][0];\n\tlhs[4][1]=lhs[4][1]-coeff*lhs[4][0];\n\tr[1]=r[1]-coeff*r[0];\n\t/* */\n\tcoeff=lhs[0][2];\n\tlhs[1][2]=lhs[1][2]-coeff*lhs[1][0];\n\tlhs[2][2]=lhs[2][2]-coeff*lhs[2][0];\n\tlhs[3][2]=lhs[3][2]-coeff*lhs[3][0];\n\tlhs[4][2]=lhs[4][2]-coeff*lhs[4][0];\n\tr[2]=r[2]-coeff*r[0];\n\t/* */\n\tcoeff=lhs[0][3];\n\tlhs[1][3]=lhs[1][3]-coeff*lhs[1][0];\n\tlhs[2][3]=lhs[2][3]-coeff*lhs[2][0];\n\tlhs[3][3]=lhs[3][3]-coeff*lhs[3][0];\n\tlhs[4][3]=lhs[4][3]-coeff*lhs[4][0];\n\tr[3]=r[3]-coeff*r[0];\n\t/* */\n\tcoeff=lhs[0][4];\n\tlhs[1][4]=lhs[1][4]-coeff*lhs[1][0];\n\tlhs[2][4]=lhs[2][4]-coeff*lhs[2][0];\n\tlhs[3][4]=lhs[3][4]-coeff*lhs[3][0];\n\tlhs[4][4]=lhs[4][4]-coeff*lhs[4][0];\n\tr[4]=r[4]-coeff*r[0];\n\t/* */\n\tpivot=1.00/lhs[1][1];\n\tlhs[2][1]=lhs[2][1]*pivot;\n\tlhs[3][1]=lhs[3][1]*pivot;\n\tlhs[4][1]=lhs[4][1]*pivot;\n\tr[1]=r[1]*pivot;\n\t/* */\n\tcoeff=lhs[1][0];\n\tlhs[2][0]=lhs[2][0]-coeff*lhs[2][1];\n\tlhs[3][0]=lhs[3][0]-coeff*lhs[3][1];\n\tlhs[4][0]=lhs[4][0]-coeff*lhs[4][1];\n\tr[0]=r[0]-coeff*r[1];\n\t/* */\n\tcoeff=lhs[1][2];\n\tlhs[2][2]=lhs[2][2]-coeff*lhs[2][1];\n\tlhs[3][2]=lhs[3][2]-coeff*lhs[3][1];\n\tlhs[4][2]=lhs[4][2]-coeff*lhs[4][1];\n\tr[2]=r[2]-coeff*r[1];\n\t/* */\n\tcoeff=lhs[1][3];\n\tlhs[2][3]=lhs[2][3]-coeff*lhs[2][1];\n\tlhs[3][3]=lhs[3][3]-coeff*lhs[3][1];\n\tlhs[4][3]=lhs[4][3]-coeff*lhs[4][1];\n\tr[3]=r[3]-coeff*r[1];\n\t/* */\n\tcoeff=lhs[1][4];\n\tlhs[2][4]=lhs[2][4]-coeff*lhs[2][1];\n\tlhs[3][4]=lhs[3][4]-coeff*lhs[3][1];\n\tlhs[4][4]=lhs[4][4]-coeff*lhs[4][1];\n\tr[4]=r[4]-coeff*r[1];\n\t/* */\n\tpivot=1.00/lhs[2][2];\n\tlhs[3][2]=lhs[3][2]*pivot;\n\tlhs[4][2]=lhs[4][2]*pivot;\n\tr[2]=r[2]*pivot;\n\t/* */\n\tcoeff=lhs[2][0];\n\tlhs[3][0]=lhs[3][0]-coeff*lhs[3][2];\n\tlhs[4][0]=lhs[4][0]-coeff*lhs[4][2];\n\tr[0]=r[0]-coeff*r[2];\n\t/* */\n\tcoeff=lhs[2][1];\n\tlhs[3][1]=lhs[3][1]-coeff*lhs[3][2];\n\tlhs[4][1]=lhs[4][1]-coeff*lhs[4][2];\n\tr[1]=r[1]-coeff*r[2];\n\t/* */\n\tcoeff=lhs[2][3];\n\tlhs[3][3]=lhs[3][3]-coeff*lhs[3][2];\n\tlhs[4][3]=lhs[4][3]-coeff*lhs[4][2];\n\tr[3]=r[3]-coeff*r[2];\n\t/* */\n\tcoeff=lhs[2][4];\n\tlhs[3][4]=lhs[3][4]-coeff*lhs[3][2];\n\tlhs[4][4]=lhs[4][4]-coeff*lhs[4][2];\n\tr[4]=r[4]-coeff*r[2];\n\t/* */\n\tpivot=1.00/lhs[3][3];\n\tlhs[4][3]=lhs[4][3]*pivot;\n\tr[3]=r[3]*pivot;\n\t/* */\n\tcoeff=lhs[3][0];\n\tlhs[4][0]=lhs[4][0]-coeff*lhs[4][3];\n\tr[0]=r[0]-coeff*r[3];\n\t/* */\n\tcoeff=lhs[3][1];\n\tlhs[4][1]=lhs[4][1]-coeff*lhs[4][3];\n\tr[1]=r[1]-coeff*r[3];\n\t/* */\n\tcoeff=lhs[3][2];\n\tlhs[4][2]=lhs[4][2]-coeff*lhs[4][3];\n\tr[2]=r[2]-coeff*r[3];\n\t/* */\n\tcoeff=lhs[3][4];\n\tlhs[4][4]=lhs[4][4]-coeff*lhs[4][3];\n\tr[4]=r[4]-coeff*r[3];\n\t/* */\n\tpivot=1.00/lhs[4][4];\n\tr[4]=r[4]*pivot;\n\t/* */\n\tcoeff=lhs[4][0];\n\tr[0]=r[0]-coeff*r[4];\n\t/* */\n\tcoeff=lhs[4][1];\n\tr[1]=r[1]-coeff*r[4];\n\t/* */\n\tcoeff=lhs[4][2];\n\tr[2]=r[2]-coeff*r[4];\n\t/* */\n\tcoeff=lhs[4][3];\n\tr[3]=r[3]-coeff*r[4];\n}\n\nvoid compute_rhs(){\n\tint i, j, k, m;\n\tdouble rho_inv, uijk, up1, um1, vijk, vp1, vm1, wijk, wp1, wm1;\n\tif(timeron){timer_start(T_RHS);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute the reciprocal of density, and the kinetic energy, \n\t * and the speed of sound.\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\trho_inv=1.0/u[k][j][i][0];\n\t\t\t\trho_i[k][j][i]=rho_inv;\n\t\t\t\tus[k][j][i]=u[k][j][i][1]*rho_inv;\n\t\t\t\tvs[k][j][i]=u[k][j][i][2]*rho_inv;\n\t\t\t\tws[k][j][i]=u[k][j][i][3]*rho_inv;\n\t\t\t\tsquare[k][j][i]=0.5*(\n\t\t\t\t\t\tu[k][j][i][1]*u[k][j][i][1]+ \n\t\t\t\t\t\tu[k][j][i][2]*u[k][j][i][2]+\n\t\t\t\t\t\tu[k][j][i][3]*u[k][j][i][3])*rho_inv;\n\t\t\t\tqs[k][j][i]=square[k][j][i]*rho_inv;\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * copy the exact forcing term to the right hand side; because \n\t * this forcing term is known, we can store it on the whole grid\n\t * including the boundary                   \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=forcing[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_start(T_RHSX);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute xi-direction fluxes \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tuijk=us[k][j][i];\n\t\t\t\tup1=us[k][j][i+1];\n\t\t\t\tum1=us[k][j][i-1];\n\t\t\t\trhs[k][j][i][0]=rhs[k][j][i][0]+dx1tx1* \n\t\t\t\t\t(u[k][j][i+1][0]-2.0*u[k][j][i][0]+ \n\t\t\t\t\t u[k][j][i-1][0])-\n\t\t\t\t\ttx2*(u[k][j][i+1][1]-u[k][j][i-1][1]);\n\t\t\t\trhs[k][j][i][1]=rhs[k][j][i][1]+dx2tx1* \n\t\t\t\t\t(u[k][j][i+1][1]-2.0*u[k][j][i][1]+ \n\t\t\t\t\t u[k][j][i-1][1])+\n\t\t\t\t\txxcon2*con43*(up1-2.0*uijk+um1)-\n\t\t\t\t\ttx2*(u[k][j][i+1][1]*up1- \n\t\t\t\t\t\t\tu[k][j][i-1][1]*um1+\n\t\t\t\t\t\t\t(u[k][j][i+1][4]- square[k][j][i+1]-\n\t\t\t\t\t\t\t u[k][j][i-1][4]+ square[k][j][i-1])*\n\t\t\t\t\t\t\tc2);\n\t\t\t\trhs[k][j][i][2]=rhs[k][j][i][2]+dx3tx1* \n\t\t\t\t\t(u[k][j][i+1][2]-2.0*u[k][j][i][2]+\n\t\t\t\t\t u[k][j][i-1][2])+\n\t\t\t\t\txxcon2*(vs[k][j][i+1]-2.0*vs[k][j][i]+\n\t\t\t\t\t\t\tvs[k][j][i-1])-\n\t\t\t\t\ttx2*(u[k][j][i+1][2]*up1- \n\t\t\t\t\t\t\tu[k][j][i-1][2]*um1);\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]+dx4tx1* \n\t\t\t\t\t(u[k][j][i+1][3]-2.0*u[k][j][i][3]+\n\t\t\t\t\t u[k][j][i-1][3])+\n\t\t\t\t\txxcon2*(ws[k][j][i+1]-2.0*ws[k][j][i]+\n\t\t\t\t\t\t\tws[k][j][i-1])-\n\t\t\t\t\ttx2*(u[k][j][i+1][3]*up1- \n\t\t\t\t\t\t\tu[k][j][i-1][3]*um1);\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]+dx5tx1* \n\t\t\t\t\t(u[k][j][i+1][4]-2.0*u[k][j][i][4]+\n\t\t\t\t\t u[k][j][i-1][4])+\n\t\t\t\t\txxcon3*(qs[k][j][i+1]-2.0*qs[k][j][i]+\n\t\t\t\t\t\t\tqs[k][j][i-1])+\n\t\t\t\t\txxcon4*(up1*up1-2.0*uijk*uijk+ \n\t\t\t\t\t\t\tum1*um1)+\n\t\t\t\t\txxcon5*(u[k][j][i+1][4]*rho_i[k][j][i+1]- \n\t\t\t\t\t\t\t2.0*u[k][j][i][4]*rho_i[k][j][i]+\n\t\t\t\t\t\t\tu[k][j][i-1][4]*rho_i[k][j][i-1])-\n\t\t\t\t\ttx2*((c1*u[k][j][i+1][4]- \n\t\t\t\t\t\t\t\tc2*square[k][j][i+1])*up1-\n\t\t\t\t\t\t\t(c1*u[k][j][i-1][4]- \n\t\t\t\t\t\t\t c2*square[k][j][i-1])*um1);\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * add fourth order xi-direction dissipation               \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\ti=1;\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(5.0*u[k][j][i][m]-4.0*u[k][j][i+1][m]+\n\t\t\t\t\t u[k][j][i+2][m]);\n\t\t\t}\n\t\t\ti=2;\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(-4.0*u[k][j][i-1][m]+6.0*u[k][j][i][m]-\n\t\t\t\t\t 4.0*u[k][j][i+1][m]+u[k][j][i+2][m]);\n\t\t\t}\n\t\t}\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=3; i<=grid_points[0]-4; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp * \n\t\t\t\t\t\t(u[k][j][i-2][m]-4.0*u[k][j][i-1][m]+ \n\t\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k][j][i+1][m]+ \n\t\t\t\t\t\t u[k][j][i+2][m]);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\ti=grid_points[0]-3;\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k][j][i-2][m]-4.0*u[k][j][i-1][m]+ \n\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k][j][i+1][m]);\n\t\t\t}\n\t\t\ti=grid_points[0]-2;\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k][j][i-2][m]-4.*u[k][j][i-1][m]+\n\t\t\t\t\t 5.*u[k][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_RHSX);}\n\tif(timeron){timer_start(T_RHSY);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute eta-direction fluxes \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tvijk=vs[k][j][i];\n\t\t\t\tvp1=vs[k][j+1][i];\n\t\t\t\tvm1=vs[k][j-1][i];\n\t\t\t\trhs[k][j][i][0]=rhs[k][j][i][0]+dy1ty1* \n\t\t\t\t\t(u[k][j+1][i][0]-2.0*u[k][j][i][0]+ \n\t\t\t\t\t u[k][j-1][i][0])-\n\t\t\t\t\tty2*(u[k][j+1][i][2]-u[k][j-1][i][2]);\n\t\t\t\trhs[k][j][i][1]=rhs[k][j][i][1]+dy2ty1* \n\t\t\t\t\t(u[k][j+1][i][1]-2.0*u[k][j][i][1]+ \n\t\t\t\t\t u[k][j-1][i][1])+\n\t\t\t\t\tyycon2*(us[k][j+1][i]-2.0*us[k][j][i]+ \n\t\t\t\t\t\t\tus[k][j-1][i])-\n\t\t\t\t\tty2*(u[k][j+1][i][1]*vp1- \n\t\t\t\t\t\t\tu[k][j-1][i][1]*vm1);\n\t\t\t\trhs[k][j][i][2]=rhs[k][j][i][2]+dy3ty1* \n\t\t\t\t\t(u[k][j+1][i][2]-2.0*u[k][j][i][2]+ \n\t\t\t\t\t u[k][j-1][i][2])+\n\t\t\t\t\tyycon2*con43*(vp1-2.0*vijk+vm1)-\n\t\t\t\t\tty2*(u[k][j+1][i][2]*vp1- \n\t\t\t\t\t\t\tu[k][j-1][i][2]*vm1+\n\t\t\t\t\t\t\t(u[k][j+1][i][4]-square[k][j+1][i]- \n\t\t\t\t\t\t\t u[k][j-1][i][4]+square[k][j-1][i])\n\t\t\t\t\t\t\t*c2);\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]+dy4ty1* \n\t\t\t\t\t(u[k][j+1][i][3]-2.0*u[k][j][i][3]+ \n\t\t\t\t\t u[k][j-1][i][3])+\n\t\t\t\t\tyycon2*(ws[k][j+1][i]-2.0*ws[k][j][i]+ \n\t\t\t\t\t\t\tws[k][j-1][i])-\n\t\t\t\t\tty2*(u[k][j+1][i][3]*vp1- \n\t\t\t\t\t\t\tu[k][j-1][i][3]*vm1);\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]+dy5ty1* \n\t\t\t\t\t(u[k][j+1][i][4]-2.0*u[k][j][i][4]+ \n\t\t\t\t\t u[k][j-1][i][4])+\n\t\t\t\t\tyycon3*(qs[k][j+1][i]-2.0*qs[k][j][i]+ \n\t\t\t\t\t\t\tqs[k][j-1][i])+\n\t\t\t\t\tyycon4*(vp1*vp1-2.0*vijk*vijk+ \n\t\t\t\t\t\t\tvm1*vm1)+\n\t\t\t\t\tyycon5*(u[k][j+1][i][4]*rho_i[k][j+1][i]- \n\t\t\t\t\t\t\t2.0*u[k][j][i][4]*rho_i[k][j][i]+\n\t\t\t\t\t\t\tu[k][j-1][i][4]*rho_i[k][j-1][i])-\n\t\t\t\t\tty2*((c1*u[k][j+1][i][4]- \n\t\t\t\t\t\t\t\tc2*square[k][j+1][i])*vp1-\n\t\t\t\t\t\t\t(c1*u[k][j-1][i][4]- \n\t\t\t\t\t\t\t c2*square[k][j-1][i])*vm1);\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * add fourth order eta-direction dissipation         \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tj=1;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(5.0*u[k][j][i][m]-4.0*u[k][j+1][i][m]+\n\t\t\t\t\t u[k][j+2][i][m]);\n\t\t\t}\n\t\t}\n\t\tj=2;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(-4.0*u[k][j-1][i][m]+6.0*u[k][j][i][m]-\n\t\t\t\t\t 4.0*u[k][j+1][i][m]+u[k][j+2][i][m]);\n\t\t\t}\n\t\t}\n\t\tfor(j=3; j<=grid_points[1]-4; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t\t(u[k][j-2][i][m]-4.0*u[k][j-1][i][m]+ \n\t\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k][j+1][i][m]+ \n\t\t\t\t\t\t u[k][j+2][i][m]);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tj=grid_points[1]-3;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k][j-2][i][m]-4.0*u[k][j-1][i][m]+ \n\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k][j+1][i][m]);\n\t\t\t}\n\t\t}\n\t\tj=grid_points[1]-2;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k][j-2][i][m]-4.*u[k][j-1][i][m]+\n\t\t\t\t\t 5.*u[k][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_RHSY);}\n\tif(timeron){timer_start(T_RHSZ);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute zeta-direction fluxes \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\twijk=ws[k][j][i];\n\t\t\t\twp1=ws[k+1][j][i];\n\t\t\t\twm1=ws[k-1][j][i];\n\t\t\t\trhs[k][j][i][0]=rhs[k][j][i][0]+dz1tz1* \n\t\t\t\t\t(u[k+1][j][i][0]-2.0*u[k][j][i][0]+ \n\t\t\t\t\t u[k-1][j][i][0])-\n\t\t\t\t\ttz2*(u[k+1][j][i][3]-u[k-1][j][i][3]);\n\t\t\t\trhs[k][j][i][1]=rhs[k][j][i][1]+dz2tz1* \n\t\t\t\t\t(u[k+1][j][i][1]-2.0*u[k][j][i][1]+ \n\t\t\t\t\t u[k-1][j][i][1])+\n\t\t\t\t\tzzcon2*(us[k+1][j][i]-2.0*us[k][j][i]+ \n\t\t\t\t\t\t\tus[k-1][j][i])-\n\t\t\t\t\ttz2*(u[k+1][j][i][1]*wp1- \n\t\t\t\t\t\t\tu[k-1][j][i][1]*wm1);\n\t\t\t\trhs[k][j][i][2]=rhs[k][j][i][2]+dz3tz1* \n\t\t\t\t\t(u[k+1][j][i][2]-2.0*u[k][j][i][2]+ \n\t\t\t\t\t u[k-1][j][i][2])+\n\t\t\t\t\tzzcon2*(vs[k+1][j][i]-2.0*vs[k][j][i]+ \n\t\t\t\t\t\t\tvs[k-1][j][i])-\n\t\t\t\t\ttz2*(u[k+1][j][i][2]*wp1- \n\t\t\t\t\t\t\tu[k-1][j][i][2]*wm1);\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]+dz4tz1* \n\t\t\t\t\t(u[k+1][j][i][3]-2.0*u[k][j][i][3]+ \n\t\t\t\t\t u[k-1][j][i][3])+\n\t\t\t\t\tzzcon2*con43*(wp1-2.0*wijk+wm1)-\n\t\t\t\t\ttz2*(u[k+1][j][i][3]*wp1- \n\t\t\t\t\t\t\tu[k-1][j][i][3]*wm1+\n\t\t\t\t\t\t\t(u[k+1][j][i][4]-square[k+1][j][i]- \n\t\t\t\t\t\t\t u[k-1][j][i][4]+square[k-1][j][i])\n\t\t\t\t\t\t\t*c2);\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]+dz5tz1* \n\t\t\t\t\t(u[k+1][j][i][4]-2.0*u[k][j][i][4]+ \n\t\t\t\t\t u[k-1][j][i][4])+\n\t\t\t\t\tzzcon3*(qs[k+1][j][i]-2.0*qs[k][j][i]+ \n\t\t\t\t\t\t\tqs[k-1][j][i])+\n\t\t\t\t\tzzcon4*(wp1*wp1-2.0*wijk*wijk+ \n\t\t\t\t\t\t\twm1*wm1)+\n\t\t\t\t\tzzcon5*(u[k+1][j][i][4]*rho_i[k+1][j][i]- \n\t\t\t\t\t\t\t2.0*u[k][j][i][4]*rho_i[k][j][i]+\n\t\t\t\t\t\t\tu[k-1][j][i][4]*rho_i[k-1][j][i])-\n\t\t\t\t\ttz2*((c1*u[k+1][j][i][4]- \n\t\t\t\t\t\t\t\tc2*square[k+1][j][i])*wp1-\n\t\t\t\t\t\t\t(c1*u[k-1][j][i][4]- \n\t\t\t\t\t\t\t c2*square[k-1][j][i])*wm1);\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * add fourth order zeta-direction dissipation                \n\t * ---------------------------------------------------------------------\n\t */\n\tk=1;\n\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(5.0*u[k][j][i][m]-4.0*u[k+1][j][i][m]+\n\t\t\t\t\t u[k+2][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tk=2;\n\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(-4.0*u[k-1][j][i][m]+6.0*u[k][j][i][m]-\n\t\t\t\t\t 4.0*u[k+1][j][i][m]+u[k+2][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tfor(k=3; k<=grid_points[2]-4; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t\t(u[k-2][j][i][m]-4.0*u[k-1][j][i][m]+ \n\t\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k+1][j][i][m]+ \n\t\t\t\t\t\t u[k+2][j][i][m]);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tk=grid_points[2]-3;\n\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k-2][j][i][m]-4.0*u[k-1][j][i][m]+ \n\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k+1][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tk=grid_points[2]-2;\n\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k-2][j][i][m]-4.*u[k-1][j][i][m]+\n\t\t\t\t\t 5.*u[k][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_RHSZ);}\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]*dt;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_RHS);}\n}\n\n/*\n * ---------------------------------------------------------------------\n * this function computes the norm of the difference between the\n * computed solution and the exact solution\n * ---------------------------------------------------------------------\n */\nvoid error_norm(double rms[5]){\n\tint i, j, k, m, d;\n\tdouble xi, eta, zeta, u_exact[5], add;\n\tfor(m=0;m<5;m++){rms[m]=0.0;}\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)(k)*dnzm1;\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\teta=(double)(j)*dnym1;\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\txi=(double)(i)*dnxm1;\n\t\t\t\texact_solution(xi, eta, zeta, u_exact);\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tadd=u[k][j][i][m]-u_exact[m];\n\t\t\t\t\trms[m]=rms[m]+add*add;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tfor(m=0; m<5; m++){\n\t\tfor(d=0; d<3; d++){\n\t\t\trms[m]=rms[m]/(double)(grid_points[d]-2);\n\t\t}\n\t\trms[m]=sqrt(rms[m]);\n\t}\n}\n\n/*\n * ---------------------------------------------------------------------\n * compute the right hand side based on exact solution\n * ---------------------------------------------------------------------\n */\nvoid exact_rhs(){\n\tdouble dtemp[5], xi, eta, zeta, dtpp;\n\tint m, i, j, k, ip1, im1, jp1, jm1, km1, kp1;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * initialize                                  \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tforcing[k][j][i][m]=0.0;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * xi-direction flux differences                      \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tzeta=(double)(k)*dnzm1;\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\teta=(double)(j)*dnym1;\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\txi=(double)(i)*dnxm1;\n\t\t\t\texact_solution(xi, eta, zeta, dtemp);\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tue[m][i]=dtemp[m];\n\t\t\t\t}\n\t\t\t\tdtpp=1.0/dtemp[0];\n\t\t\t\tfor(m=1; m<5; m++){\n\t\t\t\t\tbuf[m][i]=dtpp*dtemp[m];\n\t\t\t\t}\n\t\t\t\tcuf[i]=buf[1][i]*buf[1][i];\n\t\t\t\tbuf[0][i]=cuf[i]+buf[2][i]*buf[2][i]+buf[3][i]*buf[3][i];\n\t\t\t\tq[i]=0.5*(buf[1][i]*ue[1][i]+buf[2][i]*ue[2][i]+\n\t\t\t\t\t\tbuf[3][i]*ue[3][i]);\n\t\t\t}\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tim1=i-1;\n\t\t\t\tip1=i+1;\n\t\t\t\tforcing[k][j][i][0]=forcing[k][j][i][0]-\n\t\t\t\t\ttx2*(ue[1][ip1]-ue[1][im1])+\n\t\t\t\t\tdx1tx1*(ue[0][ip1]-2.0*ue[0][i]+ue[0][im1]);\n\t\t\t\tforcing[k][j][i][1]=forcing[k][j][i][1]-tx2*(\n\t\t\t\t\t\t(ue[1][ip1]*buf[1][ip1]+c2*(ue[4][ip1]-q[ip1]))-\n\t\t\t\t\t\t(ue[1][im1]*buf[1][im1]+c2*(ue[4][im1]-q[im1])))+\n\t\t\t\t\txxcon1*(buf[1][ip1]-2.0*buf[1][i]+buf[1][im1])+\n\t\t\t\t\tdx2tx1*(ue[1][ip1]-2.0*ue[1][i]+ue[1][im1]);\n\t\t\t\tforcing[k][j][i][2]=forcing[k][j][i][2]-tx2*(\n\t\t\t\t\t\tue[2][ip1]*buf[1][ip1]-ue[2][im1]*buf[1][im1])+\n\t\t\t\t\txxcon2*(buf[2][ip1]-2.0*buf[2][i]+buf[2][im1])+\n\t\t\t\t\tdx3tx1*(ue[2][ip1]-2.0*ue[2][i] +ue[2][im1]);\n\t\t\t\tforcing[k][j][i][3]=forcing[k][j][i][3]-tx2*(\n\t\t\t\t\t\tue[3][ip1]*buf[1][ip1]-ue[3][im1]*buf[1][im1])+\n\t\t\t\t\txxcon2*(buf[3][ip1]-2.0*buf[3][i]+buf[3][im1])+\n\t\t\t\t\tdx4tx1*(ue[3][ip1]-2.0*ue[3][i]+ue[3][im1]);\n\t\t\t\tforcing[k][j][i][4]=forcing[k][j][i][4]-tx2*(\n\t\t\t\t\t\tbuf[1][ip1]*(c1*ue[4][ip1]-c2*q[ip1])-\n\t\t\t\t\t\tbuf[1][im1]*(c1*ue[4][im1]-c2*q[im1]))+\n\t\t\t\t\t0.5*xxcon3*(buf[0][ip1]-2.0*buf[0][i]+\n\t\t\t\t\t\t\tbuf[0][im1])+\n\t\t\t\t\txxcon4*(cuf[ip1]-2.0*cuf[i]+cuf[im1])+\n\t\t\t\t\txxcon5*(buf[4][ip1]-2.0*buf[4][i]+buf[4][im1])+\n\t\t\t\t\tdx5tx1*(ue[4][ip1]-2.0*ue[4][i]+ue[4][im1]);\n\t\t\t}\n\t\t\t/* \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * fourth-order dissipation                         \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\ti=1;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(5.0*ue[m][i]-4.0*ue[m][i+1]+ue[m][i+2]);\n\t\t\t\ti=2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(-4.0*ue[m][i-1]+6.0*ue[m][i]-\n\t\t\t\t\t 4.0*ue[m][i+1]+ue[m][i+2]);\n\t\t\t}\n\t\t\tfor(i=3; i<=grid_points[0]-4; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t\t(ue[m][i-2]-4.0*ue[m][i-1]+\n\t\t\t\t\t\t 6.0*ue[m][i]-4.0*ue[m][i+1]+ue[m][i+2]);\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\ti=grid_points[0]-3;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][i-2]-4.0*ue[m][i-1]+\n\t\t\t\t\t 6.0*ue[m][i]-4.0*ue[m][i+1]);\n\t\t\t\ti=grid_points[0]-2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][i-2]-4.0*ue[m][i-1]+5.0*ue[m][i]);\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * eta-direction flux differences             \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tzeta=(double)(k)*dnzm1;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\txi=(double)(i)*dnxm1;\n\t\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\t\teta=(double)(j)*dnym1;\n\t\t\t\texact_solution(xi, eta, zeta, dtemp);\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tue[m][j]=dtemp[m];\n\t\t\t\t}\n\t\t\t\tdtpp=1.0/dtemp[0];\n\t\t\t\tfor(m=1; m<5; m++){\n\t\t\t\t\tbuf[m][j]=dtpp*dtemp[m];\n\t\t\t\t}\n\t\t\t\tcuf[j]=buf[2][j]*buf[2][j];\n\t\t\t\tbuf[0][j]=cuf[j]+buf[1][j]*buf[1][j]+buf[3][j]*buf[3][j];\n\t\t\t\tq[j]=0.5*(buf[1][j]*ue[1][j]+buf[2][j]*ue[2][j]+\n\t\t\t\t\t\tbuf[3][j]*ue[3][j]);\n\t\t\t}\n\t\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\t\tjm1=j-1;\n\t\t\t\tjp1=j+1;\n\t\t\t\tforcing[k][j][i][0]=forcing[k][j][i][0]-\n\t\t\t\t\tty2*(ue[2][jp1]-ue[2][jm1])+\n\t\t\t\t\tdy1ty1*(ue[0][jp1]-2.0*ue[0][j]+ue[0][jm1]);\n\t\t\t\tforcing[k][j][i][1]=forcing[k][j][i][1]-ty2*(\n\t\t\t\t\t\tue[1][jp1]*buf[2][jp1]-ue[1][jm1]*buf[2][jm1])+\n\t\t\t\t\tyycon2*(buf[1][jp1]-2.0*buf[1][j]+buf[1][jm1])+\n\t\t\t\t\tdy2ty1*(ue[1][jp1]-2.0*ue[1][j]+ue[1][jm1]);\n\t\t\t\tforcing[k][j][i][2]=forcing[k][j][i][2]-ty2*(\n\t\t\t\t\t\t(ue[2][jp1]*buf[2][jp1]+c2*(ue[4][jp1]-q[jp1]))-\n\t\t\t\t\t\t(ue[2][jm1]*buf[2][jm1]+c2*(ue[4][jm1]-q[jm1])))+\n\t\t\t\t\tyycon1*(buf[2][jp1]-2.0*buf[2][j]+buf[2][jm1])+\n\t\t\t\t\tdy3ty1*(ue[2][jp1]-2.0*ue[2][j]+ue[2][jm1]);\n\t\t\t\tforcing[k][j][i][3]=forcing[k][j][i][3]-ty2*(\n\t\t\t\t\t\tue[3][jp1]*buf[2][jp1]-ue[3][jm1]*buf[2][jm1])+\n\t\t\t\t\tyycon2*(buf[3][jp1]-2.0*buf[3][j]+buf[3][jm1])+\n\t\t\t\t\tdy4ty1*(ue[3][jp1]-2.0*ue[3][j]+ue[3][jm1]);\n\t\t\t\tforcing[k][j][i][4]=forcing[k][j][i][4]-ty2*(\n\t\t\t\t\t\tbuf[2][jp1]*(c1*ue[4][jp1]-c2*q[jp1])-\n\t\t\t\t\t\tbuf[2][jm1]*(c1*ue[4][jm1]-c2*q[jm1]))+\n\t\t\t\t\t0.5*yycon3*(buf[0][jp1]-2.0*buf[0][j]+\n\t\t\t\t\t\t\tbuf[0][jm1])+\n\t\t\t\t\tyycon4*(cuf[jp1]-2.0*cuf[j]+cuf[jm1])+\n\t\t\t\t\tyycon5*(buf[4][jp1]-2.0*buf[4][j]+buf[4][jm1])+\n\t\t\t\t\tdy5ty1*(ue[4][jp1]-2.0*ue[4][j]+ue[4][jm1]);\n\t\t\t}\n\t\t\t/* \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * fourth-order dissipation                      \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tj=1;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(5.0*ue[m][j]-4.0*ue[m][j+1] +ue[m][j+2]);\n\t\t\t\tj=2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(-4.0*ue[m][j-1]+6.0*ue[m][j]-\n\t\t\t\t\t 4.0*ue[m][j+1]+ue[m][j+2]);\n\t\t\t}\n\t\t\tfor(j=3; j<=grid_points[1]-4; j++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t\t(ue[m][j-2]-4.0*ue[m][j-1]+\n\t\t\t\t\t\t 6.0*ue[m][j]-4.0*ue[m][j+1]+ue[m][j+2]);\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tj=grid_points[1]-3;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][j-2]-4.0*ue[m][j-1]+\n\t\t\t\t\t 6.0*ue[m][j]-4.0*ue[m][j+1]);\n\t\t\t\tj=grid_points[1]-2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][j-2]-4.0*ue[m][j-1]+5.0*ue[m][j]);\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * zeta-direction flux differences                      \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\teta=(double)(j)*dnym1;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\txi=(double)(i)*dnxm1;\n\t\t\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\t\t\tzeta=(double)(k)*dnzm1;\n\t\t\t\texact_solution(xi, eta, zeta, dtemp);\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tue[m][k]=dtemp[m];\n\t\t\t\t}\n\t\t\t\tdtpp=1.0/dtemp[0];\n\t\t\t\tfor(m=1; m<5; m++){\n\t\t\t\t\tbuf[m][k]=dtpp*dtemp[m];\n\t\t\t\t}\n\t\t\t\tcuf[k]=buf[3][k]*buf[3][k];\n\t\t\t\tbuf[0][k]=cuf[k]+buf[1][k]*buf[1][k]+buf[2][k]*buf[2][k];\n\t\t\t\tq[k]=0.5*(buf[1][k]*ue[1][k]+buf[2][k]*ue[2][k]+\n\t\t\t\t\t\tbuf[3][k]*ue[3][k]);\n\t\t\t}\n\t\t\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\t\t\tkm1=k-1;\n\t\t\t\tkp1=k+1;\n\t\t\t\tforcing[k][j][i][0]=forcing[k][j][i][0]-\n\t\t\t\t\ttz2*(ue[3][kp1]-ue[3][km1])+\n\t\t\t\t\tdz1tz1*(ue[0][kp1]-2.0*ue[0][k]+ue[0][km1]);\n\t\t\t\tforcing[k][j][i][1]=forcing[k][j][i][1]-tz2*(\n\t\t\t\t\t\tue[1][kp1]*buf[3][kp1]-ue[1][km1]*buf[3][km1])+\n\t\t\t\t\tzzcon2*(buf[1][kp1]-2.0*buf[1][k]+buf[1][km1])+\n\t\t\t\t\tdz2tz1*(ue[1][kp1]-2.0*ue[1][k]+ue[1][km1]);\n\t\t\t\tforcing[k][j][i][2]=forcing[k][j][i][2]-tz2*(\n\t\t\t\t\t\tue[2][kp1]*buf[3][kp1]-ue[2][km1]*buf[3][km1])+\n\t\t\t\t\tzzcon2*(buf[2][kp1]-2.0*buf[2][k]+buf[2][km1])+\n\t\t\t\t\tdz3tz1*(ue[2][kp1]-2.0*ue[2][k]+ue[2][km1]);\n\t\t\t\tforcing[k][j][i][3]=forcing[k][j][i][3]-tz2*(\n\t\t\t\t\t\t(ue[3][kp1]*buf[3][kp1]+c2*(ue[4][kp1]-q[kp1]))-\n\t\t\t\t\t\t(ue[3][km1]*buf[3][km1]+c2*(ue[4][km1]-q[km1])))+\n\t\t\t\t\tzzcon1*(buf[3][kp1]-2.0*buf[3][k]+buf[3][km1])+\n\t\t\t\t\tdz4tz1*(ue[3][kp1]-2.0*ue[3][k]+ue[3][km1]);\n\t\t\t\tforcing[k][j][i][4]=forcing[k][j][i][4]-tz2*(\n\t\t\t\t\t\tbuf[3][kp1]*(c1*ue[4][kp1]-c2*q[kp1])-\n\t\t\t\t\t\tbuf[3][km1]*(c1*ue[4][km1]-c2*q[km1]))+\n\t\t\t\t\t0.5*zzcon3*(buf[0][kp1]-2.0*buf[0][k]\n\t\t\t\t\t\t\t+buf[0][km1])+\n\t\t\t\t\tzzcon4*(cuf[kp1]-2.0*cuf[k]+cuf[km1])+\n\t\t\t\t\tzzcon5*(buf[4][kp1]-2.0*buf[4][k]+buf[4][km1])+\n\t\t\t\t\tdz5tz1*(ue[4][kp1]-2.0*ue[4][k]+ue[4][km1]);\n\t\t\t}\n\t\t\t/* \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * fourth-order dissipation                        \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tk=1;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(5.0*ue[m][k]-4.0*ue[m][k+1]+ue[m][k+2]);\n\t\t\t\tk=2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(-4.0*ue[m][k-1]+6.0*ue[m][k]-\n\t\t\t\t\t 4.0*ue[m][k+1]+ue[m][k+2]);\n\t\t\t}\n\t\t\tfor(k=3; k<=grid_points[2]-4; k++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t\t(ue[m][k-2]-4.0*ue[m][k-1]+\n\t\t\t\t\t\t 6.0*ue[m][k]-4.0*ue[m][k+1]+ue[m][k+2]);\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tk=grid_points[2]-3;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][k-2]-4.0*ue[m][k-1]+\n\t\t\t\t\t 6.0*ue[m][k]-4.0*ue[m][k+1]);\n\t\t\t\tk=grid_points[2]-2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][k-2]-4.0*ue[m][k-1]+5.0*ue[m][k]);\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * now change the sign of the forcing function\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tforcing[k][j][i][m]=-1.0*forcing[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\n/* \n * ---------------------------------------------------------------------\n * this function returns the exact solution at point xi, eta, zeta  \n * ---------------------------------------------------------------------\n */\nvoid exact_solution(double xi, double eta, double zeta, double dtemp[5]){\n\tint m;\n\tfor(m=0; m<5; m++){\n\t\tdtemp[m]=ce[0][m]+\n\t\t\txi*(ce[1][m]+\n\t\t\t\t\txi*(ce[4][m]+\n\t\t\t\t\t\txi*(ce[7][m]+\n\t\t\t\t\t\t\txi*ce[10][m])))+\n\t\t\teta*(ce[2][m]+\n\t\t\t\t\teta*(ce[5][m]+\n\t\t\t\t\t\teta*(ce[8][m]+\n\t\t\t\t\t\t\teta*ce[11][m])))+\n\t\t\tzeta*(ce[3][m]+\n\t\t\t\t\tzeta*(ce[6][m]+\n\t\t\t\t\t\tzeta*(ce[9][m]+ \n\t\t\t\t\t\t\tzeta*ce[12][m])));\n\t}\n}\n\n/* \n * ---------------------------------------------------------------------\n * this subroutine initializes the field variable u using \n * tri-linear transfinite interpolation of the boundary values     \n * ---------------------------------------------------------------------\n */\nvoid initialize(){\n\tint i, j, k, m, ix, iy, iz;\n\tdouble xi, eta, zeta, Pface[2][3][5], Pxi, Peta, Pzeta, temp[5];\n\t/* \n\t * ---------------------------------------------------------------------\n\t * later (in compute_rhs) we compute 1/u for every element. a few of \n\t * the corner elements are not used, but it convenient (and faster) \n\t * to compute the whole thing with a simple loop. make sure those \n\t * values are nonzero by initializing the whole thing here. \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tu[k][j][i][m]=1.0;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * first store the \"interpolated\" values everywhere on the grid    \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)(k)* dnzm1;\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\teta=(double)(j)*dnym1;\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\txi=(double)(i)*dnxm1;\n\t\t\t\tfor(ix=0; ix<2; ix++){\n\t\t\t\t\texact_solution((double)ix, eta, zeta, &Pface[ix][0][0]);\n\t\t\t\t}\n\t\t\t\tfor(iy=0; iy<2; iy++){\n\t\t\t\t\texact_solution(xi, (double)iy , zeta, &Pface[iy][1][0]);\n\t\t\t\t}\n\t\t\t\tfor(iz=0; iz<2; iz++){\n\t\t\t\t\texact_solution(xi, eta, (double)iz, &Pface[iz][2][0]);\n\t\t\t\t}\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tPxi=xi*Pface[1][0][m]+(1.0-xi)*Pface[0][0][m];\n\t\t\t\t\tPeta=eta*Pface[1][1][m]+(1.0-eta)*Pface[0][1][m];\n\t\t\t\t\tPzeta=zeta*Pface[1][2][m]+(1.0-zeta)*Pface[0][2][m];\n\t\t\t\t\tu[k][j][i][m]=Pxi+Peta+Pzeta- \n\t\t\t\t\t\tPxi*Peta-Pxi*Pzeta-Peta*Pzeta+ \n\t\t\t\t\t\tPxi*Peta*Pzeta;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * now store the exact values on the boundaries        \n\t * ---------------------------------------------------------------------\n\t * west face                                                  \n\t * ---------------------------------------------------------------------\n\t */\n\ti=0;\n\txi=0.0;\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)(k)*dnzm1;\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\teta=(double)(j)*dnym1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * east face                                                      \n\t * ---------------------------------------------------------------------\n\t */\n\ti=grid_points[0]-1;\n\txi=1.0;\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)(k)*dnzm1;\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\teta=(double)(j)*dnym1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * south face                                                 \n\t * ---------------------------------------------------------------------\n\t */\n\tj=0;\n\teta=0.0;\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)(k)*dnzm1;\n\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\txi=(double)(i)*dnxm1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * north face                                    \n\t * ---------------------------------------------------------------------\n\t */\n\tj=grid_points[1]-1;\n\teta=1.0;\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)(k)*dnzm1;\n\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\txi=(double)(i)*dnxm1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * bottom face                                       \n\t * ---------------------------------------------------------------------\n\t */\n\tk=0;\n\tzeta=0.0;\n\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\teta=(double)(j)*dnym1;\n\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\txi=(double)(i)*dnxm1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * top face     \n\t * ---------------------------------------------------------------------\n\t */\n\tk=grid_points[2]-1;\n\tzeta=1.0;\n\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\teta=(double)(j)*dnym1;\n\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\txi=(double)(i)*dnxm1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n}\n\nvoid lhsinit(double lhs[][3][5][5], int size){\n\tint i, m, n;\n\ti=size;\n\t/* \n\t * ---------------------------------------------------------------------\n\t * zero the whole left hand side for starters\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(m=0; m<5; m++){\n\t\tfor(n=0; n<5; n++){\t\t\n\t\t\tlhs[0][0][n][m]=0.0;\n\t\t\tlhs[0][1][n][m]=0.0;\n\t\t\tlhs[0][2][n][m]=0.0;\n\t\t\tlhs[i][0][n][m]=0.0;\n\t\t\tlhs[i][1][n][m]=0.0;\n\t\t\tlhs[i][2][n][m]=0.0;\n\t\t}\n\t}\n\t/* \n\t * ---------------------------------------------------------------------\n\t * next, set all diagonal values to 1. This is overkill, but convenient\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(m=0; m<5; m++){\n\t\tlhs[0][1][m][m]=1.0;\n\t\tlhs[i][1][m][m]=1.0;\n\t}\n}\n\n/*\n * ---------------------------------------------------------------------\n * subtracts a(i,j,k) X b(i,j,k) from c(i,j,k)\n * ---------------------------------------------------------------------\n */\nvoid matmul_sub(double ablock[5][5], double bblock[5][5], double cblock[5][5]){\n\tcblock[0][0]=cblock[0][0]-ablock[0][0]*bblock[0][0]\n\t\t-ablock[1][0]*bblock[0][1]\n\t\t-ablock[2][0]*bblock[0][2]\n\t\t-ablock[3][0]*bblock[0][3]\n\t\t-ablock[4][0]*bblock[0][4];\n\tcblock[0][1]=cblock[0][1]-ablock[0][1]*bblock[0][0]\n\t\t-ablock[1][1]*bblock[0][1]\n\t\t-ablock[2][1]*bblock[0][2]\n\t\t-ablock[3][1]*bblock[0][3]\n\t\t-ablock[4][1]*bblock[0][4];\n\tcblock[0][2]=cblock[0][2]-ablock[0][2]*bblock[0][0]\n\t\t-ablock[1][2]*bblock[0][1]\n\t\t-ablock[2][2]*bblock[0][2]\n\t\t-ablock[3][2]*bblock[0][3]\n\t\t-ablock[4][2]*bblock[0][4];\n\tcblock[0][3]=cblock[0][3]-ablock[0][3]*bblock[0][0]\n\t\t-ablock[1][3]*bblock[0][1]\n\t\t-ablock[2][3]*bblock[0][2]\n\t\t-ablock[3][3]*bblock[0][3]\n\t\t-ablock[4][3]*bblock[0][4];\n\tcblock[0][4]=cblock[0][4]-ablock[0][4]*bblock[0][0]\n\t\t-ablock[1][4]*bblock[0][1]\n\t\t-ablock[2][4]*bblock[0][2]\n\t\t-ablock[3][4]*bblock[0][3]\n\t\t-ablock[4][4]*bblock[0][4];\n\tcblock[1][0]=cblock[1][0]-ablock[0][0]*bblock[1][0]\n\t\t-ablock[1][0]*bblock[1][1]\n\t\t-ablock[2][0]*bblock[1][2]\n\t\t-ablock[3][0]*bblock[1][3]\n\t\t-ablock[4][0]*bblock[1][4];\n\tcblock[1][1]=cblock[1][1]-ablock[0][1]*bblock[1][0]\n\t\t-ablock[1][1]*bblock[1][1]\n\t\t-ablock[2][1]*bblock[1][2]\n\t\t-ablock[3][1]*bblock[1][3]\n\t\t-ablock[4][1]*bblock[1][4];\n\tcblock[1][2]=cblock[1][2]-ablock[0][2]*bblock[1][0]\n\t\t-ablock[1][2]*bblock[1][1]\n\t\t-ablock[2][2]*bblock[1][2]\n\t\t-ablock[3][2]*bblock[1][3]\n\t\t-ablock[4][2]*bblock[1][4];\n\tcblock[1][3]=cblock[1][3]-ablock[0][3]*bblock[1][0]\n\t\t-ablock[1][3]*bblock[1][1]\n\t\t-ablock[2][3]*bblock[1][2]\n\t\t-ablock[3][3]*bblock[1][3]\n\t\t-ablock[4][3]*bblock[1][4];\n\tcblock[1][4]=cblock[1][4]-ablock[0][4]*bblock[1][0]\n\t\t-ablock[1][4]*bblock[1][1]\n\t\t-ablock[2][4]*bblock[1][2]\n\t\t-ablock[3][4]*bblock[1][3]\n\t\t-ablock[4][4]*bblock[1][4];\n\tcblock[2][0]=cblock[2][0]-ablock[0][0]*bblock[2][0]\n\t\t-ablock[1][0]*bblock[2][1]\n\t\t-ablock[2][0]*bblock[2][2]\n\t\t-ablock[3][0]*bblock[2][3]\n\t\t-ablock[4][0]*bblock[2][4];\n\tcblock[2][1]=cblock[2][1]-ablock[0][1]*bblock[2][0]\n\t\t-ablock[1][1]*bblock[2][1]\n\t\t-ablock[2][1]*bblock[2][2]\n\t\t-ablock[3][1]*bblock[2][3]\n\t\t-ablock[4][1]*bblock[2][4];\n\tcblock[2][2]=cblock[2][2]-ablock[0][2]*bblock[2][0]\n\t\t-ablock[1][2]*bblock[2][1]\n\t\t-ablock[2][2]*bblock[2][2]\n\t\t-ablock[3][2]*bblock[2][3]\n\t\t-ablock[4][2]*bblock[2][4];\n\tcblock[2][3]=cblock[2][3]-ablock[0][3]*bblock[2][0]\n\t\t-ablock[1][3]*bblock[2][1]\n\t\t-ablock[2][3]*bblock[2][2]\n\t\t-ablock[3][3]*bblock[2][3]\n\t\t-ablock[4][3]*bblock[2][4];\n\tcblock[2][4]=cblock[2][4]-ablock[0][4]*bblock[2][0]\n\t\t-ablock[1][4]*bblock[2][1]\n\t\t-ablock[2][4]*bblock[2][2]\n\t\t-ablock[3][4]*bblock[2][3]\n\t\t-ablock[4][4]*bblock[2][4];\n\tcblock[3][0]=cblock[3][0]-ablock[0][0]*bblock[3][0]\n\t\t-ablock[1][0]*bblock[3][1]\n\t\t-ablock[2][0]*bblock[3][2]\n\t\t-ablock[3][0]*bblock[3][3]\n\t\t-ablock[4][0]*bblock[3][4];\n\tcblock[3][1]=cblock[3][1]-ablock[0][1]*bblock[3][0]\n\t\t-ablock[1][1]*bblock[3][1]\n\t\t-ablock[2][1]*bblock[3][2]\n\t\t-ablock[3][1]*bblock[3][3]\n\t\t-ablock[4][1]*bblock[3][4];\n\tcblock[3][2]=cblock[3][2]-ablock[0][2]*bblock[3][0]\n\t\t-ablock[1][2]*bblock[3][1]\n\t\t-ablock[2][2]*bblock[3][2]\n\t\t-ablock[3][2]*bblock[3][3]\n\t\t-ablock[4][2]*bblock[3][4];\n\tcblock[3][3]=cblock[3][3]-ablock[0][3]*bblock[3][0]\n\t\t-ablock[1][3]*bblock[3][1]\n\t\t-ablock[2][3]*bblock[3][2]\n\t\t-ablock[3][3]*bblock[3][3]\n\t\t-ablock[4][3]*bblock[3][4];\n\tcblock[3][4]=cblock[3][4]-ablock[0][4]*bblock[3][0]\n\t\t-ablock[1][4]*bblock[3][1]\n\t\t-ablock[2][4]*bblock[3][2]\n\t\t-ablock[3][4]*bblock[3][3]\n\t\t-ablock[4][4]*bblock[3][4];\n\tcblock[4][0]=cblock[4][0]-ablock[0][0]*bblock[4][0]\n\t\t-ablock[1][0]*bblock[4][1]\n\t\t-ablock[2][0]*bblock[4][2]\n\t\t-ablock[3][0]*bblock[4][3]\n\t\t-ablock[4][0]*bblock[4][4];\n\tcblock[4][1]=cblock[4][1]-ablock[0][1]*bblock[4][0]\n\t\t-ablock[1][1]*bblock[4][1]\n\t\t-ablock[2][1]*bblock[4][2]\n\t\t-ablock[3][1]*bblock[4][3]\n\t\t-ablock[4][1]*bblock[4][4];\n\tcblock[4][2]=cblock[4][2]-ablock[0][2]*bblock[4][0]\n\t\t-ablock[1][2]*bblock[4][1]\n\t\t-ablock[2][2]*bblock[4][2]\n\t\t-ablock[3][2]*bblock[4][3]\n\t\t-ablock[4][2]*bblock[4][4];\n\tcblock[4][3]=cblock[4][3]-ablock[0][3]*bblock[4][0]\n\t\t-ablock[1][3]*bblock[4][1]\n\t\t-ablock[2][3]*bblock[4][2]\n\t\t-ablock[3][3]*bblock[4][3]\n\t\t-ablock[4][3]*bblock[4][4];\n\tcblock[4][4]=cblock[4][4]-ablock[0][4]*bblock[4][0]\n\t\t-ablock[1][4]*bblock[4][1]\n\t\t-ablock[2][4]*bblock[4][2]\n\t\t-ablock[3][4]*bblock[4][3]\n\t\t-ablock[4][4]*bblock[4][4];\n}\n\n/*\n * ---------------------------------------------------------------------\n * subtracts bvec=bvec - ablock*avec\n * ---------------------------------------------------------------------\n */\nvoid matvec_sub(double ablock[5][5], double avec[5], double bvec[5]){\n\t/*\n\t * ---------------------------------------------------------------------\n\t * rhs[kc][jc][ic][i] = rhs[kc][jc][ic][i] - lhs[ia][ablock][0][i]*\n\t * ---------------------------------------------------------------------\n\t */\n\tbvec[0]=bvec[0]-ablock[0][0]*avec[0]\n\t\t-ablock[1][0]*avec[1]\n\t\t-ablock[2][0]*avec[2]\n\t\t-ablock[3][0]*avec[3]\n\t\t-ablock[4][0]*avec[4];\n\tbvec[1]=bvec[1]-ablock[0][1]*avec[0]\n\t\t-ablock[1][1]*avec[1]\n\t\t-ablock[2][1]*avec[2]\n\t\t-ablock[3][1]*avec[3]\n\t\t-ablock[4][1]*avec[4];\n\tbvec[2]=bvec[2]-ablock[0][2]*avec[0]\n\t\t-ablock[1][2]*avec[1]\n\t\t-ablock[2][2]*avec[2]\n\t\t-ablock[3][2]*avec[3]\n\t\t-ablock[4][2]*avec[4];\n\tbvec[3]=bvec[3]-ablock[0][3]*avec[0]\n\t\t-ablock[1][3]*avec[1]\n\t\t-ablock[2][3]*avec[2]\n\t\t-ablock[3][3]*avec[3]\n\t\t-ablock[4][3]*avec[4];\n\tbvec[4]=bvec[4]-ablock[0][4]*avec[0]\n\t\t-ablock[1][4]*avec[1]\n\t\t-ablock[2][4]*avec[2]\n\t\t-ablock[3][4]*avec[3]\n\t\t-ablock[4][4]*avec[4];\n}\n\nvoid rhs_norm(double rms[5]){\n\tint i, j, k, d, m;\n\tdouble add;\n\tfor(m=0; m<5; m++){\n\t\trms[m]=0.0;\n\t} \n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfor(m=0; m<5; m++) {\n\t\t\t\t\tadd=rhs[k][j][i][m];\n\t\t\t\t\trms[m]=rms[m]+add*add;\n\t\t\t\t} \n\t\t\t} \n\t\t} \n\t}\n\tfor(m=0; m<5; m++){\n\t\tfor(d=0; d<3; d++){\n\t\t\trms[m]=rms[m]/(double)(grid_points[d]-2);\n\t\t} \n\t\trms[m]=sqrt(rms[m]);\n\t} \n}\n\nvoid set_constants(){\n\tce[0][0]=2.0;\n\tce[1][0]=0.0;\n\tce[2][0]=0.0;\n\tce[3][0]=4.0;\n\tce[4][0]=5.0;\n\tce[5][0]=3.0;\n\tce[6][0]=0.5;\n\tce[7][0]=0.02;\n\tce[8][0]=0.01;\n\tce[9][0]=0.03;\n\tce[10][0]=0.5;\n\tce[11][0]=0.4;\n\tce[12][0]=0.3;\n\t/* */\n\tce[0][1]=1.0;\n\tce[1][1]=0.0;\n\tce[2][1]=0.0;\n\tce[3][1]=0.0;\n\tce[4][1]=1.0;\n\tce[5][1]=2.0;\n\tce[6][1]=3.0;\n\tce[7][1]=0.01;\n\tce[8][1]=0.03;\n\tce[9][1]=0.02;\n\tce[10][1]=0.4;\n\tce[11][1]=0.3;\n\tce[12][1]=0.5;\n\t/* */\n\tce[0][2]=2.0;\n\tce[1][2]=2.0;\n\tce[2][2]=0.0;\n\tce[3][2]=0.0;\n\tce[4][2]=0.0;\n\tce[5][2]=2.0;\n\tce[6][2]=3.0;\n\tce[7][2]=0.04;\n\tce[8][2]=0.03;\n\tce[9][2]=0.05;\n\tce[10][2]=0.3;\n\tce[11][2]=0.5;\n\tce[12][2]=0.4;\n\t/* */\n\tce[0][3]=2.0;\n\tce[1][3]=2.0;\n\tce[2][3]=0.0;\n\tce[3][3]=0.0;\n\tce[4][3]=0.0;\n\tce[5][3]=2.0;\n\tce[6][3]=3.0;\n\tce[7][3]=0.03;\n\tce[8][3]=0.05;\n\tce[9][3]=0.04;\n\tce[10][3]=0.2;\n\tce[11][3]=0.1;\n\tce[12][3]=0.3;\n\t/* */\n\tce[0][4]=5.0;\n\tce[1][4]=4.0;\n\tce[2][4]=3.0;\n\tce[3][4]=2.0;\n\tce[4][4]=0.1;\n\tce[5][4]=0.4;\n\tce[6][4]=0.3;\n\tce[7][4]=0.05;\n\tce[8][4]=0.04;\n\tce[9][4]=0.03;\n\tce[10][4]=0.1;\n\tce[11][4]=0.3;\n\tce[12][4]=0.2;\n\t/* */\n\tc1=1.4;\n\tc2=0.4;\n\tc3=0.1;\n\tc4=1.0;\n\tc5=1.4;\n\tdnxm1=1.0/(double)(grid_points[0]-1);\n\tdnym1=1.0/(double)(grid_points[1]-1);\n\tdnzm1=1.0/(double)(grid_points[2]-1);\n\tc1c2=c1*c2;\n\tc1c5=c1*c5;\n\tc3c4=c3*c4;\n\tc1345=c1c5*c3c4;\n\tconz1=(1.0-c1c5);\n\ttx1=1.0/(dnxm1*dnxm1);\n\ttx2=1.0/(2.0*dnxm1);\n\ttx3=1.0/dnxm1;\n\tty1=1.0/(dnym1*dnym1);\n\tty2=1.0/(2.0*dnym1);\n\tty3=1.0/dnym1;\n\ttz1=1.0/(dnzm1*dnzm1);\n\ttz2=1.0/(2.0*dnzm1);\n\ttz3=1.0/dnzm1;\n\tdx1=0.75;\n\tdx2=0.75;\n\tdx3=0.75;\n\tdx4=0.75;\n\tdx5=0.75;\n\tdy1=0.75;\n\tdy2=0.75;\n\tdy3=0.75;\n\tdy4=0.75;\n\tdy5=0.75;\n\tdz1=1.0;\n\tdz2=1.0;\n\tdz3=1.0;\n\tdz4=1.0;\n\tdz5=1.0;\n\tdxmax=max(dx3, dx4);\n\tdymax=max(dy2, dy4);\n\tdzmax=max(dz2, dz3);\n\tdssp=0.25*max(dx1, max(dy1, dz1));\n\tc4dssp=4.0*dssp;\n\tc5dssp=5.0*dssp;\n\tdttx1=dt*tx1;\n\tdttx2=dt*tx2;\n\tdtty1=dt*ty1;\n\tdtty2=dt*ty2;\n\tdttz1=dt*tz1;\n\tdttz2=dt*tz2;\n\tc2dttx1=2.0*dttx1;\n\tc2dtty1=2.0*dtty1;\n\tc2dttz1=2.0*dttz1;\n\tdtdssp=dt*dssp;\n\tcomz1=dtdssp;\n\tcomz4=4.0*dtdssp;\n\tcomz5=5.0*dtdssp;\n\tcomz6=6.0*dtdssp;\n\tc3c4tx3=c3c4*tx3;\n\tc3c4ty3=c3c4*ty3;\n\tc3c4tz3=c3c4*tz3;\n\tdx1tx1=dx1*tx1;\n\tdx2tx1=dx2*tx1;\n\tdx3tx1=dx3*tx1;\n\tdx4tx1=dx4*tx1;\n\tdx5tx1=dx5*tx1;\n\tdy1ty1=dy1*ty1;\n\tdy2ty1=dy2*ty1;\n\tdy3ty1=dy3*ty1;\n\tdy4ty1=dy4*ty1;\n\tdy5ty1=dy5*ty1;\n\tdz1tz1=dz1*tz1;\n\tdz2tz1=dz2*tz1;\n\tdz3tz1=dz3*tz1;\n\tdz4tz1=dz4*tz1;\n\tdz5tz1=dz5*tz1;\n\tc2iv=2.5;\n\tcon43=4.0/3.0;\n\tcon16=1.0/6.0;\n\txxcon1=c3c4tx3*con43*tx3;\n\txxcon2=c3c4tx3*tx3;\n\txxcon3=c3c4tx3*conz1*tx3;\n\txxcon4=c3c4tx3*con16*tx3;\n\txxcon5=c3c4tx3*c1c5*tx3;\n\tyycon1=c3c4ty3*con43*ty3;\n\tyycon2=c3c4ty3*ty3;\n\tyycon3=c3c4ty3*conz1*ty3;\n\tyycon4=c3c4ty3*con16*ty3;\n\tyycon5=c3c4ty3*c1c5*ty3;\n\tzzcon1=c3c4tz3*con43*tz3;\n\tzzcon2=c3c4tz3*tz3;\n\tzzcon3=c3c4tz3*conz1*tz3;\n\tzzcon4=c3c4tz3*con16*tz3;\n\tzzcon5=c3c4tz3*c1c5*tz3;\n}\n\n/*\n * ---------------------------------------------------------------------\n * verification routine                         \n * ---------------------------------------------------------------------\n */\nvoid verify(int no_time_steps, char* class_npb, boolean* verified){\n\tdouble xcrref[5], xceref[5], xcrdif[5], xcedif[5]; \n\tdouble epsilon, xce[5], xcr[5], dtref=0.0;\n\tint m;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * tolerance level\n\t * ---------------------------------------------------------------------\n\t */  \n\tepsilon=1.0e-08;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute the error norm and the residual norm, and exit if not printing\n\t * ---------------------------------------------------------------------\n\t */  \n\terror_norm(xce);\n\tcompute_rhs();\n\trhs_norm(xcr);\n\tfor(m=0; m<5; m++){\n\t\txcr[m]=xcr[m]/dt;\n\t}\n\t*class_npb='U';\n\t*verified=TRUE;\n\tfor(m=0; m<5; m++){\n\t\txcrref[m]=1.0;\n\t\txceref[m]=1.0;\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * reference data for 12X12X12 grids after 60 time steps, with DT = 1.0e-02\n\t * ---------------------------------------------------------------------\n\t */  \n\tif((grid_points[0]==12)&&\n\t\t\t(grid_points[1]==12)&&\n\t\t\t(grid_points[2]==12)&&\n\t\t\t(no_time_steps==60)){\n\t\t*class_npb='S';\n\t\tdtref=1.0e-2;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual.\n\t\t * ---------------------------------------------------------------------\n\t\t */       \n\t\txcrref[0]=1.7034283709541311e-01;\n\t\txcrref[1]=1.2975252070034097e-02;\n\t\txcrref[2]=3.2527926989486055e-02;\n\t\txcrref[3]=2.6436421275166801e-02;\n\t\txcrref[4]=1.9211784131744430e-01;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txceref[0]=4.9976913345811579e-04;\n\t\txceref[1]=4.5195666782961927e-05;\n\t\txceref[2]=7.3973765172921357e-05;\n\t\txceref[3]=7.3821238632439731e-05;\n\t\txceref[4]=8.9269630987491446e-04;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 24X24X24 grids after 200 time steps, with DT = 0.8d-3\n\t\t * ---------------------------------------------------------------------\n\t\t */  \n\t}else if((grid_points[0]==24)&&\n\t\t\t(grid_points[1]==24)&&\n\t\t\t(grid_points[2]==24)&&\n\t\t\t(no_time_steps==200)){\n\t\t*class_npb='W';\n\t\tdtref = 0.8e-3;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txcrref[0]=0.1125590409344e+03;\n\t\txcrref[1]=0.1180007595731e+02;\n\t\txcrref[2]=0.2710329767846e+02;\n\t\txcrref[3]=0.2469174937669e+02;\n\t\txcrref[4]=0.2638427874317e+03;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txceref[0]=0.4419655736008e+01;\n\t\txceref[1]=0.4638531260002e+00;\n\t\txceref[2]=0.1011551749967e+01;\n\t\txceref[3]=0.9235878729944e+00;\n\t\txceref[4]=0.1018045837718e+02;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 64X64X64 grids after 200 time steps, with DT = 0.8d-3\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t}else if((grid_points[0]==64)&&\n\t\t\t(grid_points[1]==64)&&\n\t\t\t(grid_points[2]==64)&&\n\t\t\t(no_time_steps==200)){\n\t\t*class_npb='A';\n\t\tdtref=0.8e-3;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txcrref[0]=1.0806346714637264e+02;\n\t\txcrref[1]=1.1319730901220813e+01;\n\t\txcrref[2]=2.5974354511582465e+01;\n\t\txcrref[3]=2.3665622544678910e+01;\n\t\txcrref[4]=2.5278963211748344e+02;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txceref[0]=4.2348416040525025e+00;\n\t\txceref[1]=4.4390282496995698e-01;\n\t\txceref[2]=9.6692480136345650e-01;\n\t\txceref[3]=8.8302063039765474e-01;\n\t\txceref[4]=9.7379901770829278e+00;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 102X102X102 grids after 200 time steps,\n\t\t * with DT = 3.0e-04\n\t\t * ---------------------------------------------------------------------\n\t\t */  \n\t}else if((grid_points[0]==102)&&\n\t\t\t(grid_points[1]==102)&&\n\t\t\t(grid_points[2]==102)&&\n\t\t\t(no_time_steps==200)){\n\t\t*class_npb='B';\n\t\tdtref=3.0e-4;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txcrref[0]=1.4233597229287254e+03;\n\t\txcrref[1]=9.9330522590150238e+01;\n\t\txcrref[2]=3.5646025644535285e+02;\n\t\txcrref[3]=3.2485447959084092e+02;\n\t\txcrref[4]=3.2707541254659363e+03;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error.\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txceref[0]=5.2969847140936856e+01;\n\t\txceref[1]=4.4632896115670668e+00;\n\t\txceref[2]=1.3122573342210174e+01;\n\t\txceref[3]=1.2006925323559144e+01;\n\t\txceref[4]=1.2459576151035986e+02;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 162X162X162 grids after 200 time steps,\n\t\t * with DT = 1.0e-04\n\t\t * ---------------------------------------------------------------------\n\t\t */  \n\t}else if((grid_points[0]==162)&&\n\t\t\t(grid_points[1]==162)&&\n\t\t\t(grid_points[2]==162)&&\n\t\t\t(no_time_steps==200)){\n\t\t*class_npb='C';\n\t\tdtref=1.0e-4;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txcrref[0]=0.62398116551764615e+04;\n\t\txcrref[1]=0.50793239190423964e+03;\n\t\txcrref[2]=0.15423530093013596e+04;\n\t\txcrref[3]=0.13302387929291190e+04;\n\t\txcrref[4]=0.11604087428436455e+05;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txceref[0]=0.16462008369091265e+03;\n\t\txceref[1]=0.11497107903824313e+02;\n\t\txceref[2]=0.41207446207461508e+02;\n\t\txceref[3]=0.37087651059694167e+02;\n\t\txceref[4]=0.36211053051841265e+03;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 408x408x408 grids after 250 time steps,\n\t\t * with DT = 0.2e-04\n\t\t * ---------------------------------------------------------------------\n\t\t */ \n\t}else if((grid_points[0]==408)&&\n\t\t\t(grid_points[1]==408)&&\n\t\t\t(grid_points[2]==408)&&\n\t\t\t(no_time_steps==250)){\n\t\t*class_npb='D';\n\t\tdtref=0.2e-4;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txcrref[0]=0.2533188551738e+05;\n\t\txcrref[1]=0.2346393716980e+04;\n\t\txcrref[2]=0.6294554366904e+04;\n\t\txcrref[3]=0.5352565376030e+04;\n\t\txcrref[4]=0.3905864038618e+05;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txceref[0]=0.3100009377557e+03;\n\t\txceref[1]=0.2424086324913e+02;\n\t\txceref[2]=0.7782212022645e+02;\n\t\txceref[3]=0.6835623860116e+02;\n\t\txceref[4]=0.6065737200368e+03;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 1020x1020x1020 grids after 250 time steps,\n\t\t * with DT = 0.4e-05\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t}else if((grid_points[0]==1020)&&\n\t\t\t(grid_points[1]==1020)&&\n\t\t\t(grid_points[2]==1020)&&\n\t\t\t(no_time_steps==250)){\n\t\t*class_npb='E';\n\t\tdtref=0.4e-5;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txcrref[0]=0.9795372484517e+05;\n\t\txcrref[1]=0.9739814511521e+04;\n\t\txcrref[2]=0.2467606342965e+05;\n\t\txcrref[3]=0.2092419572860e+05;\n\t\txcrref[4]=0.1392138856939e+06;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error.\n\t\t * ---------------------------------------------------------------------\n\t\t */    \n\t\txceref[0]=0.4327562208414e+03;\n\t\txceref[1]=0.3699051964887e+02;\n\t\txceref[2]=0.1089845040954e+03;\n\t\txceref[3]=0.9462517622043e+02;\n\t\txceref[4]=0.7765512765309e+03;\n\t}else{\n\t\t*verified=FALSE;\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * verification test for residuals if gridsize is one of \n\t * the defined grid sizes above (*class_npb != 'U')\n\t * ---------------------------------------------------------------------\n\t * compute the difference of solution values and the known reference values.\n\t * ---------------------------------------------------------------------\n\t */  \n\tfor(m=0; m<5; m++){\n\t\txcrdif[m]=fabs((xcr[m]-xcrref[m])/xcrref[m]);\n\t\txcedif[m]=fabs((xce[m]-xceref[m])/xceref[m]);\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * output the comparison of computed results to known cases.\n\t * ---------------------------------------------------------------------\n\t */ \n\tif(*class_npb!='U'){\n\t\tprintf(\" Verification being performed for class_npb %c\\n\",*class_npb);\n\t\tprintf(\" accuracy setting for epsilon = %20.13E\\n\",epsilon);\n\t\t*verified=(fabs(dt-dtref)<=epsilon);\n\t\tif(!(*verified)){  \n\t\t\t*class_npb='U';\n\t\t\tprintf(\" DT does not match the reference value of %15.8E\\n\",dtref);\n\t\t}\n\t}else{ \n\t\tprintf(\" Unknown class_npb\\n\");\n\t}\n\tif(*class_npb!='U'){\n\t\tprintf(\" Comparison of RMS-norms of residual\\n\");\n\t}else{\n\t\tprintf(\" RMS-norms of residual\\n\");\n\t}\n\tfor(m=0; m<5; m++){\n\t\tif(*class_npb=='U'){\n\t\t\tprintf(\"          %2d%20.13E\\n\",m+1,xcr[m]);\n\t\t}else if(xcrdif[m]<=epsilon){\n\t\t\tprintf(\"          %2d%20.13E%20.13E%20.13E\\n\",m+1,xcr[m],xcrref[m],xcrdif[m]);\n\t\t}else{ \n\t\t\t*verified=FALSE;\n\t\t\tprintf(\" FAILURE: %2d%20.13E%20.13E%20.13E\\n\",m+1,xcr[m],xcrref[m],xcrdif[m]);\n\t\t}\n\t}\n\tif(*class_npb!='U'){\n\t\tprintf(\" Comparison of RMS-norms of solution error\\n\");\n\t}else{\n\t\tprintf(\" RMS-norms of solution error\\n\");\n\t}\n\tfor(m=0; m<5; m++){\n\t\tif(*class_npb=='U'){\n\t\t\tprintf(\"          %2d%20.13E\\n\",m+1,xce[m]);\n\t\t}else if(xcedif[m]<=epsilon){\n\t\t\tprintf(\"          %2d%20.13E%20.13E%20.13E\\n\",m+1,xce[m],xceref[m],xcedif[m]);\n\t\t}else{\n\t\t\t*verified=FALSE;\n\t\t\tprintf(\" FAILURE: %2d%20.13E%20.13E%20.13E\\n\",m+1,xce[m],xceref[m],xcedif[m]);\n\t\t}\n\t}\n\tif(*class_npb=='U'){\n\t\tprintf(\" No reference values provided\\n\");\n\t\tprintf(\" No verification performed\\n\");\n\t}else if(*verified){\n\t\tprintf(\" Verification Successful\\n\");\n\t}else{\n\t\tprintf(\" Verification failed\\n\");\n\t}\n}\n\n/*\n * ---------------------------------------------------------------------\n * performs line solves in X direction by first factoring\n * the block-tridiagonal matrix into an upper triangular matrix, \n * and then performing back substitution to solve for the unknow\n * vectors of each line.  \n * \n * make sure we treat elements zero to cell_size in the direction\n * of the sweep. \n * ---------------------------------------------------------------------\n */\nvoid x_solve(){\n\tint i, j, k, m, n, isize;\n\tif(timeron){timer_start(T_XSOLVE);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * this function computes the left hand side in the xi-direction\n\t * ---------------------------------------------------------------------\n\t */\n\tisize=grid_points[0]-1;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * determine a (labeled f) and n jacobians\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=0; i<=isize; i++){\n\t\t\t\ttmp1=rho_i[k][j][i];\n\t\t\t\ttmp2=tmp1*tmp1;\n\t\t\t\ttmp3=tmp1*tmp2;\n\t\t\t\tfjac[i][0][0]=0.0;\n\t\t\t\tfjac[i][1][0]=1.0;\n\t\t\t\tfjac[i][2][0]=0.0;\n\t\t\t\tfjac[i][3][0]=0.0;\n\t\t\t\tfjac[i][4][0]=0.0;\n\t\t\t\tfjac[i][0][1]=-(u[k][j][i][1]*tmp2*u[k][j][i][1])+c2*qs[k][j][i];\n\t\t\t\tfjac[i][1][1]=(2.0-c2)*(u[k][j][i][1]/u[k][j][i][0]);\n\t\t\t\tfjac[i][2][1]=-c2*(u[k][j][i][2]*tmp1);\n\t\t\t\tfjac[i][3][1]=-c2*(u[k][j][i][3]*tmp1);\n\t\t\t\tfjac[i][4][1]=c2;\n\t\t\t\tfjac[i][0][2]=-(u[k][j][i][1]*u[k][j][i][2])*tmp2;\n\t\t\t\tfjac[i][1][2]=u[k][j][i][2]*tmp1;\n\t\t\t\tfjac[i][2][2]=u[k][j][i][1]*tmp1;\n\t\t\t\tfjac[i][3][2]=0.0;\n\t\t\t\tfjac[i][4][2]=0.0;\n\t\t\t\tfjac[i][0][3]=-(u[k][j][i][1]*u[k][j][i][3])*tmp2;\n\t\t\t\tfjac[i][1][3]=u[k][j][i][3]*tmp1;\n\t\t\t\tfjac[i][2][3]=0.0;\n\t\t\t\tfjac[i][3][3]=u[k][j][i][1]*tmp1;\n\t\t\t\tfjac[i][4][3]=0.0;\n\t\t\t\tfjac[i][0][4]=(c2*2.0*square[k][j][i]-c1*u[k][j][i][4])*(u[k][j][i][1]*tmp2);\n\t\t\t\tfjac[i][1][4]=c1*u[k][j][i][4]*tmp1-c2*(u[k][j][i][1]*u[k][j][i][1]*tmp2+qs[k][j][i]);\n\t\t\t\tfjac[i][2][4]=-c2*(u[k][j][i][2]*u[k][j][i][1])*tmp2;\n\t\t\t\tfjac[i][3][4]=-c2*(u[k][j][i][3]*u[k][j][i][1])*tmp2;\n\t\t\t\tfjac[i][4][4]=c1*(u[k][j][i][1]*tmp1);\n\t\t\t\tnjac[i][0][0]=0.0;\n\t\t\t\tnjac[i][1][0]=0.0;\n\t\t\t\tnjac[i][2][0]=0.0;\n\t\t\t\tnjac[i][3][0]=0.0;\n\t\t\t\tnjac[i][4][0]=0.0;\n\t\t\t\tnjac[i][0][1]=-con43*c3c4*tmp2*u[k][j][i][1];\n\t\t\t\tnjac[i][1][1]=con43*c3c4*tmp1;\n\t\t\t\tnjac[i][2][1]=0.0;\n\t\t\t\tnjac[i][3][1]=0.0;\n\t\t\t\tnjac[i][4][1]=0.0;\n\t\t\t\tnjac[i][0][2]=-c3c4*tmp2*u[k][j][i][2];\n\t\t\t\tnjac[i][1][2]=0.0;\n\t\t\t\tnjac[i][2][2]=c3c4*tmp1;\n\t\t\t\tnjac[i][3][2]=0.0;\n\t\t\t\tnjac[i][4][2]=0.0;\n\t\t\t\tnjac[i][0][3]=-c3c4*tmp2*u[k][j][i][3];\n\t\t\t\tnjac[i][1][3]=0.0;\n\t\t\t\tnjac[i][2][3]=0.0;\n\t\t\t\tnjac[i][3][3]=c3c4*tmp1;\n\t\t\t\tnjac[i][4][3]=0.0;\n\t\t\t\tnjac[i][0][4]=-(con43*c3c4-c1345)*tmp3*(u[k][j][i][1]*u[k][j][i][1])\n\t\t\t\t\t-(c3c4-c1345)*tmp3*(u[k][j][i][2]*u[k][j][i][2])\n\t\t\t\t\t-(c3c4-c1345)*tmp3*(u[k][j][i][3]*u[k][j][i][3])\n\t\t\t\t\t-c1345*tmp2*u[k][j][i][4];\n\t\t\t\tnjac[i][1][4]=(con43*c3c4-c1345)*tmp2*u[k][j][i][1];\n\t\t\t\tnjac[i][2][4]=(c3c4-c1345)*tmp2*u[k][j][i][2];\n\t\t\t\tnjac[i][3][4]=(c3c4-c1345)*tmp2*u[k][j][i][3];\n\t\t\t\tnjac[i][4][4]=(c1345)*tmp1;\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * now jacobians set, so form left hand side in x direction\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tlhsinit(lhs, isize);\n\t\t\tfor(i=1; i<=isize-1; i++){\n\t\t\t\ttmp1=dt*tx1;\n\t\t\t\ttmp2=dt*tx2;\n\t\t\t\tlhs[i][AA][0][0]=-tmp2*fjac[i-1][0][0]\n\t\t\t\t\t-tmp1*njac[i-1][0][0]\n\t\t\t\t\t-tmp1*dx1; \n\t\t\t\tlhs[i][AA][1][0]=-tmp2*fjac[i-1][1][0]\n\t\t\t\t\t-tmp1*njac[i-1][1][0];\n\t\t\t\tlhs[i][AA][2][0]=-tmp2*fjac[i-1][2][0]\n\t\t\t\t\t-tmp1*njac[i-1][2][0];\n\t\t\t\tlhs[i][AA][3][0]=-tmp2*fjac[i-1][3][0]\n\t\t\t\t\t-tmp1*njac[i-1][3][0];\n\t\t\t\tlhs[i][AA][4][0]=-tmp2*fjac[i-1][4][0]\n\t\t\t\t\t-tmp1*njac[i-1][4][0];\n\t\t\t\tlhs[i][AA][0][1]=-tmp2*fjac[i-1][0][1]\n\t\t\t\t\t-tmp1*njac[i-1][0][1];\n\t\t\t\tlhs[i][AA][1][1]=-tmp2*fjac[i-1][1][1]\n\t\t\t\t\t-tmp1*njac[i-1][1][1]\n\t\t\t\t\t-tmp1*dx2;\n\t\t\t\tlhs[i][AA][2][1]=-tmp2*fjac[i-1][2][1]\n\t\t\t\t\t-tmp1*njac[i-1][2][1];\n\t\t\t\tlhs[i][AA][3][1]=-tmp2*fjac[i-1][3][1]\n\t\t\t\t\t-tmp1*njac[i-1][3][1];\n\t\t\t\tlhs[i][AA][4][1]=-tmp2*fjac[i-1][4][1]\n\t\t\t\t\t-tmp1*njac[i-1][4][1];\n\t\t\t\tlhs[i][AA][0][2]=-tmp2*fjac[i-1][0][2]\n\t\t\t\t\t-tmp1*njac[i-1][0][2];\n\t\t\t\tlhs[i][AA][1][2]=-tmp2*fjac[i-1][1][2]\n\t\t\t\t\t-tmp1*njac[i-1][1][2];\n\t\t\t\tlhs[i][AA][2][2]=-tmp2*fjac[i-1][2][2]\n\t\t\t\t\t-tmp1*njac[i-1][2][2]\n\t\t\t\t\t-tmp1*dx3;\n\t\t\t\tlhs[i][AA][3][2]=-tmp2*fjac[i-1][3][2]\n\t\t\t\t\t-tmp1*njac[i-1][3][2];\n\t\t\t\tlhs[i][AA][4][2]=-tmp2*fjac[i-1][4][2]\n\t\t\t\t\t-tmp1*njac[i-1][4][2];\n\t\t\t\tlhs[i][AA][0][3]=-tmp2*fjac[i-1][0][3]\n\t\t\t\t\t-tmp1*njac[i-1][0][3];\n\t\t\t\tlhs[i][AA][1][3]=-tmp2*fjac[i-1][1][3]\n\t\t\t\t\t-tmp1*njac[i-1][1][3];\n\t\t\t\tlhs[i][AA][2][3]=-tmp2*fjac[i-1][2][3]\n\t\t\t\t\t-tmp1*njac[i-1][2][3];\n\t\t\t\tlhs[i][AA][3][3]=-tmp2*fjac[i-1][3][3]\n\t\t\t\t\t-tmp1*njac[i-1][3][3]\n\t\t\t\t\t-tmp1*dx4;\n\t\t\t\tlhs[i][AA][4][3]=-tmp2*fjac[i-1][4][3]\n\t\t\t\t\t-tmp1*njac[i-1][4][3];\n\t\t\t\tlhs[i][AA][0][4]=-tmp2*fjac[i-1][0][4]\n\t\t\t\t\t-tmp1*njac[i-1][0][4];\n\t\t\t\tlhs[i][AA][1][4]=-tmp2*fjac[i-1][1][4]\n\t\t\t\t\t-tmp1*njac[i-1][1][4];\n\t\t\t\tlhs[i][AA][2][4]=-tmp2*fjac[i-1][2][4]\n\t\t\t\t\t-tmp1*njac[i-1][2][4];\n\t\t\t\tlhs[i][AA][3][4]=-tmp2*fjac[i-1][3][4]\n\t\t\t\t\t-tmp1*njac[i-1][3][4];\n\t\t\t\tlhs[i][AA][4][4]=-tmp2*fjac[i-1][4][4]\n\t\t\t\t\t-tmp1*njac[i-1][4][4]\n\t\t\t\t\t-tmp1*dx5;\n\t\t\t\tlhs[i][BB][0][0]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[i][0][0]\n\t\t\t\t\t+tmp1*2.0*dx1;\n\t\t\t\tlhs[i][BB][1][0]=tmp1*2.0*njac[i][1][0];\n\t\t\t\tlhs[i][BB][2][0]=tmp1*2.0*njac[i][2][0];\n\t\t\t\tlhs[i][BB][3][0]=tmp1*2.0*njac[i][3][0];\n\t\t\t\tlhs[i][BB][4][0]=tmp1*2.0*njac[i][4][0];\n\t\t\t\tlhs[i][BB][0][1]=tmp1*2.0*njac[i][0][1];\n\t\t\t\tlhs[i][BB][1][1]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[i][1][1]\n\t\t\t\t\t+tmp1*2.0*dx2;\n\t\t\t\tlhs[i][BB][2][1]=tmp1*2.0*njac[i][2][1];\n\t\t\t\tlhs[i][BB][3][1]=tmp1*2.0*njac[i][3][1];\n\t\t\t\tlhs[i][BB][4][1]=tmp1*2.0*njac[i][4][1];\n\t\t\t\tlhs[i][BB][0][2]=tmp1*2.0*njac[i][0][2];\n\t\t\t\tlhs[i][BB][1][2]=tmp1*2.0*njac[i][1][2];\n\t\t\t\tlhs[i][BB][2][2]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[i][2][2]\n\t\t\t\t\t+tmp1*2.0*dx3;\n\t\t\t\tlhs[i][BB][3][2]=tmp1*2.0*njac[i][3][2];\n\t\t\t\tlhs[i][BB][4][2]=tmp1*2.0*njac[i][4][2];\n\t\t\t\tlhs[i][BB][0][3]=tmp1*2.0*njac[i][0][3];\n\t\t\t\tlhs[i][BB][1][3]=tmp1*2.0*njac[i][1][3];\n\t\t\t\tlhs[i][BB][2][3]=tmp1*2.0*njac[i][2][3];\n\t\t\t\tlhs[i][BB][3][3]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[i][3][3]\n\t\t\t\t\t+tmp1*2.0*dx4;\n\t\t\t\tlhs[i][BB][4][3]=tmp1*2.0*njac[i][4][3];\n\t\t\t\tlhs[i][BB][0][4]=tmp1*2.0*njac[i][0][4];\n\t\t\t\tlhs[i][BB][1][4]=tmp1*2.0*njac[i][1][4];\n\t\t\t\tlhs[i][BB][2][4]=tmp1*2.0*njac[i][2][4];\n\t\t\t\tlhs[i][BB][3][4]=tmp1*2.0*njac[i][3][4];\n\t\t\t\tlhs[i][BB][4][4]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[i][4][4]\n\t\t\t\t\t+tmp1*2.0*dx5;\n\t\t\t\tlhs[i][CC][0][0]=tmp2*fjac[i+1][0][0]\n\t\t\t\t\t-tmp1*njac[i+1][0][0]\n\t\t\t\t\t-tmp1*dx1;\n\t\t\t\tlhs[i][CC][1][0]=tmp2*fjac[i+1][1][0]\n\t\t\t\t\t-tmp1*njac[i+1][1][0];\n\t\t\t\tlhs[i][CC][2][0]=tmp2*fjac[i+1][2][0]\n\t\t\t\t\t-tmp1*njac[i+1][2][0];\n\t\t\t\tlhs[i][CC][3][0]=tmp2*fjac[i+1][3][0]\n\t\t\t\t\t-tmp1*njac[i+1][3][0];\n\t\t\t\tlhs[i][CC][4][0]=tmp2*fjac[i+1][4][0]\n\t\t\t\t\t-tmp1*njac[i+1][4][0];\n\t\t\t\tlhs[i][CC][0][1]=tmp2*fjac[i+1][0][1]\n\t\t\t\t\t-tmp1*njac[i+1][0][1];\n\t\t\t\tlhs[i][CC][1][1]=tmp2*fjac[i+1][1][1]\n\t\t\t\t\t-tmp1*njac[i+1][1][1]\n\t\t\t\t\t-tmp1*dx2;\n\t\t\t\tlhs[i][CC][2][1]=tmp2*fjac[i+1][2][1]\n\t\t\t\t\t-tmp1*njac[i+1][2][1];\n\t\t\t\tlhs[i][CC][3][1]=tmp2*fjac[i+1][3][1]\n\t\t\t\t\t-tmp1*njac[i+1][3][1];\n\t\t\t\tlhs[i][CC][4][1]=tmp2*fjac[i+1][4][1]\n\t\t\t\t\t-tmp1*njac[i+1][4][1];\n\t\t\t\tlhs[i][CC][0][2]=tmp2*fjac[i+1][0][2]\n\t\t\t\t\t-tmp1*njac[i+1][0][2];\n\t\t\t\tlhs[i][CC][1][2]=tmp2*fjac[i+1][1][2]\n\t\t\t\t\t-tmp1*njac[i+1][1][2];\n\t\t\t\tlhs[i][CC][2][2]=tmp2*fjac[i+1][2][2]\n\t\t\t\t\t-tmp1*njac[i+1][2][2]\n\t\t\t\t\t-tmp1*dx3;\n\t\t\t\tlhs[i][CC][3][2]=tmp2*fjac[i+1][3][2]\n\t\t\t\t\t-tmp1*njac[i+1][3][2];\n\t\t\t\tlhs[i][CC][4][2]=tmp2*fjac[i+1][4][2]\n\t\t\t\t\t-tmp1*njac[i+1][4][2];\n\t\t\t\tlhs[i][CC][0][3]=tmp2*fjac[i+1][0][3]\n\t\t\t\t\t-tmp1*njac[i+1][0][3];\n\t\t\t\tlhs[i][CC][1][3]=tmp2*fjac[i+1][1][3]\n\t\t\t\t\t-tmp1*njac[i+1][1][3];\n\t\t\t\tlhs[i][CC][2][3]=tmp2*fjac[i+1][2][3]\n\t\t\t\t\t-tmp1*njac[i+1][2][3];\n\t\t\t\tlhs[i][CC][3][3]=tmp2*fjac[i+1][3][3]\n\t\t\t\t\t-tmp1*njac[i+1][3][3]\n\t\t\t\t\t-tmp1*dx4;\n\t\t\t\tlhs[i][CC][4][3]=tmp2*fjac[i+1][4][3]\n\t\t\t\t\t-tmp1*njac[i+1][4][3];\n\t\t\t\tlhs[i][CC][0][4]=tmp2*fjac[i+1][0][4]\n\t\t\t\t\t-tmp1*njac[i+1][0][4];\n\t\t\t\tlhs[i][CC][1][4]=tmp2*fjac[i+1][1][4]\n\t\t\t\t\t-tmp1*njac[i+1][1][4];\n\t\t\t\tlhs[i][CC][2][4]=tmp2*fjac[i+1][2][4]\n\t\t\t\t\t-tmp1*njac[i+1][2][4];\n\t\t\t\tlhs[i][CC][3][4]=tmp2 * fjac[i+1][3][4]\n\t\t\t\t\t-tmp1*njac[i+1][3][4];\n\t\t\t\tlhs[i][CC][4][4]=tmp2*fjac[i+1][4][4]\n\t\t\t\t\t-tmp1*njac[i+1][4][4]\n\t\t\t\t\t-tmp1*dx5;\n\t\t\t}\t\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * performs guaussian elimination on this cell.\n\t\t\t * \n\t\t\t * assumes that unpacking routines for non-first cells \n\t\t\t * preload C' and rhs' from previous cell.\n\t\t\t * \n\t\t\t * assumed send happens outside this routine, but that\n\t\t\t * c'(IMAX) and rhs'(IMAX) will be sent to next cell\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * outer most do loops - sweeping in i direction\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * multiply c(0,j,k) by b_inverse and copy back to c\n\t\t\t * multiply rhs(0) by b_inverse(0) and copy to rhs\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tbinvcrhs(lhs[0][BB], lhs[0][CC], rhs[k][j][0]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * begin inner most do loop\n\t\t\t * do all the elements of the cell unless last \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(i=1; i<=isize-1; i++){\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * rhs(i) = rhs(i) - A*rhs(i-1)\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tmatvec_sub(lhs[i][AA], rhs[k][j][i-1], rhs[k][j][i]);\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * B(i) = B(i) - C(i-1)*A(i)\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tmatmul_sub(lhs[i][AA], lhs[i-1][CC], lhs[i][BB]);\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * multiply c(i,j,k) by b_inverse and copy back to c\n\t\t\t\t * multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tbinvcrhs(lhs[i][BB], lhs[i][CC], rhs[k][j][i]);\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * rhs(isize) = rhs(isize) - A*rhs(isize-1)\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tmatvec_sub(lhs[isize][AA], rhs[k][j][isize-1], rhs[k][j][isize]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * B(isize) = B(isize) - C(isize-1)*A(isize)\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tmatmul_sub(lhs[isize][AA], lhs[isize-1][CC], lhs[isize][BB]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * multiply rhs() by b_inverse() and copy to rhs\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tbinvrhs(lhs[isize][BB], rhs[k][j][isize]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * back solve: if last cell, then generate U(isize)=rhs(isize)\n\t\t\t * else assume U(isize) is loaded in un pack backsub_info\n\t\t\t * so just use it\n\t\t\t * after u(istart) will be sent to next cell\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(i=isize-1; i>=0; i--){\n\t\t\t\tfor(m=0; m<BLOCK_SIZE; m++){\n\t\t\t\t\tfor(n=0; n<BLOCK_SIZE; n++){\n\t\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-lhs[i][CC][n][m]*rhs[k][j][i+1][n];\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_XSOLVE);}\n}\n\n/*\n * ---------------------------------------------------------------------\n * performs line solves in y direction by first factoring\n * the block-tridiagonal matrix into an upper triangular matrix, \n * and then performing back substitution to solve for the unknow\n * vectors of each line.  \n *  \n * make sure we treat elements zero to cell_size in the direction\n * of the sweep.\n * ---------------------------------------------------------------------\n */\nvoid y_solve(){\n\tint i, j, k, m, n, jsize;\n\tif(timeron){timer_start(T_YSOLVE);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * this function computes the left hand side for the three y-factors   \n\t * ---------------------------------------------------------------------\n\t */\n\tjsize=grid_points[1]-1;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute the indices for storing the tri-diagonal matrix;\n\t * determine a (labeled f) and n jacobians for cell c\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(j=0; j<=jsize; j++){\n\t\t\t\ttmp1=rho_i[k][j][i];\n\t\t\t\ttmp2=tmp1*tmp1;\n\t\t\t\ttmp3=tmp1*tmp2;\n\t\t\t\tfjac[j][0][0]=0.0;\n\t\t\t\tfjac[j][1][0]=0.0;\n\t\t\t\tfjac[j][2][0]=1.0;\n\t\t\t\tfjac[j][3][0]=0.0;\n\t\t\t\tfjac[j][4][0]=0.0;\n\t\t\t\tfjac[j][0][1]=-(u[k][j][i][1]*u[k][j][i][2])*tmp2;\n\t\t\t\tfjac[j][1][1]=u[k][j][i][2]*tmp1;\n\t\t\t\tfjac[j][2][1]=u[k][j][i][1]*tmp1;\n\t\t\t\tfjac[j][3][1]=0.0;\n\t\t\t\tfjac[j][4][1]=0.0;\n\t\t\t\tfjac[j][0][2]=-(u[k][j][i][2]*u[k][j][i][2]*tmp2)+c2*qs[k][j][i];\n\t\t\t\tfjac[j][1][2]=-c2*u[k][j][i][1]*tmp1;\n\t\t\t\tfjac[j][2][2]=(2.0-c2)*u[k][j][i][2]*tmp1;\n\t\t\t\tfjac[j][3][2]=-c2*u[k][j][i][3]*tmp1;\n\t\t\t\tfjac[j][4][2]=c2;\n\t\t\t\tfjac[j][0][3]=-(u[k][j][i][2]*u[k][j][i][3])*tmp2;\n\t\t\t\tfjac[j][1][3]=0.0;\n\t\t\t\tfjac[j][2][3]=u[k][j][i][3]*tmp1;\n\t\t\t\tfjac[j][3][3]=u[k][j][i][2]*tmp1;\n\t\t\t\tfjac[j][4][3]=0.0;\n\t\t\t\tfjac[j][0][4]=(c2*2.0*square[k][j][i]-c1*u[k][j][i][4])*u[k][j][i][2]*tmp2;\n\t\t\t\tfjac[j][1][4]=-c2*u[k][j][i][1]*u[k][j][i][2]*tmp2;\n\t\t\t\tfjac[j][2][4]=c1*u[k][j][i][4]*tmp1-c2*(qs[k][j][i]+u[k][j][i][2]*u[k][j][i][2]*tmp2);\n\t\t\t\tfjac[j][3][4]=-c2*(u[k][j][i][2]*u[k][j][i][3])*tmp2;\n\t\t\t\tfjac[j][4][4]=c1*u[k][j][i][2]*tmp1;\n\t\t\t\tnjac[j][0][0]=0.0;\n\t\t\t\tnjac[j][1][0]=0.0;\n\t\t\t\tnjac[j][2][0]=0.0;\n\t\t\t\tnjac[j][3][0]=0.0;\n\t\t\t\tnjac[j][4][0]=0.0;\n\t\t\t\tnjac[j][0][1]=-c3c4*tmp2*u[k][j][i][1];\n\t\t\t\tnjac[j][1][1]=c3c4*tmp1;\n\t\t\t\tnjac[j][2][1]=0.0;\n\t\t\t\tnjac[j][3][1]=0.0;\n\t\t\t\tnjac[j][4][1]=0.0;\n\t\t\t\tnjac[j][0][2]=-con43*c3c4*tmp2*u[k][j][i][2];\n\t\t\t\tnjac[j][1][2]=0.0;\n\t\t\t\tnjac[j][2][2]=con43*c3c4*tmp1;\n\t\t\t\tnjac[j][3][2]=0.0;\n\t\t\t\tnjac[j][4][2]=0.0;\n\t\t\t\tnjac[j][0][3]=-c3c4*tmp2*u[k][j][i][3];\n\t\t\t\tnjac[j][1][3]=0.0;\n\t\t\t\tnjac[j][2][3]=0.0;\n\t\t\t\tnjac[j][3][3]=c3c4*tmp1;\n\t\t\t\tnjac[j][4][3]=0.0;\n\t\t\t\tnjac[j][0][4]=-(c3c4-c1345)*tmp3*(u[k][j][i][1]*u[k][j][i][1])\n\t\t\t\t\t-(con43*c3c4-c1345)*tmp3*(u[k][j][i][2]*u[k][j][i][2])\n\t\t\t\t\t-(c3c4-c1345)*tmp3*(u[k][j][i][3]*u[k][j][i][3])\n\t\t\t\t\t-c1345*tmp2*u[k][j][i][4];\n\t\t\t\tnjac[j][1][4]=(c3c4-c1345)*tmp2*u[k][j][i][1];\n\t\t\t\tnjac[j][2][4]=(con43*c3c4-c1345)*tmp2*u[k][j][i][2];\n\t\t\t\tnjac[j][3][4]=(c3c4-c1345)*tmp2*u[k][j][i][3];\n\t\t\t\tnjac[j][4][4]=(c1345)*tmp1;\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * now joacobians set, so form left hand side in y direction\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tlhsinit(lhs, jsize);\n\t\t\tfor(j=1; j<=jsize-1; j++){\n\t\t\t\ttmp1=dt*ty1;\n\t\t\t\ttmp2=dt*ty2;\n\t\t\t\tlhs[j][AA][0][0]=-tmp2*fjac[j-1][0][0]\n\t\t\t\t\t-tmp1*njac[j-1][0][0]\n\t\t\t\t\t-tmp1*dy1; \n\t\t\t\tlhs[j][AA][1][0]=-tmp2*fjac[j-1][1][0]\n\t\t\t\t\t-tmp1*njac[j-1][1][0];\n\t\t\t\tlhs[j][AA][2][0]=-tmp2*fjac[j-1][2][0]\n\t\t\t\t\t-tmp1*njac[j-1][2][0];\n\t\t\t\tlhs[j][AA][3][0]=-tmp2*fjac[j-1][3][0]\n\t\t\t\t\t-tmp1*njac[j-1][3][0];\n\t\t\t\tlhs[j][AA][4][0]=-tmp2*fjac[j-1][4][0]\n\t\t\t\t\t-tmp1*njac[j-1][4][0];\n\t\t\t\tlhs[j][AA][0][1]=-tmp2*fjac[j-1][0][1]\n\t\t\t\t\t-tmp1*njac[j-1][0][1];\n\t\t\t\tlhs[j][AA][1][1]=-tmp2*fjac[j-1][1][1]\n\t\t\t\t\t-tmp1*njac[j-1][1][1]\n\t\t\t\t\t-tmp1*dy2;\n\t\t\t\tlhs[j][AA][2][1]=-tmp2*fjac[j-1][2][1]\n\t\t\t\t\t-tmp1*njac[j-1][2][1];\n\t\t\t\tlhs[j][AA][3][1]=-tmp2*fjac[j-1][3][1]\n\t\t\t\t\t-tmp1*njac[j-1][3][1];\n\t\t\t\tlhs[j][AA][4][1]=-tmp2*fjac[j-1][4][1]\n\t\t\t\t\t-tmp1*njac[j-1][4][1];\n\t\t\t\tlhs[j][AA][0][2]=-tmp2*fjac[j-1][0][2]\n\t\t\t\t\t-tmp1*njac[j-1][0][2];\n\t\t\t\tlhs[j][AA][1][2]=-tmp2*fjac[j-1][1][2]\n\t\t\t\t\t-tmp1*njac[j-1][1][2];\n\t\t\t\tlhs[j][AA][2][2]=-tmp2*fjac[j-1][2][2]\n\t\t\t\t\t-tmp1*njac[j-1][2][2]\n\t\t\t\t\t-tmp1*dy3;\n\t\t\t\tlhs[j][AA][3][2]=-tmp2*fjac[j-1][3][2]\n\t\t\t\t\t-tmp1*njac[j-1][3][2];\n\t\t\t\tlhs[j][AA][4][2]=-tmp2*fjac[j-1][4][2]\n\t\t\t\t\t-tmp1*njac[j-1][4][2];\n\t\t\t\tlhs[j][AA][0][3]=-tmp2*fjac[j-1][0][3]\n\t\t\t\t\t-tmp1*njac[j-1][0][3];\n\t\t\t\tlhs[j][AA][1][3]=-tmp2*fjac[j-1][1][3]\n\t\t\t\t\t-tmp1*njac[j-1][1][3];\n\t\t\t\tlhs[j][AA][2][3]=-tmp2*fjac[j-1][2][3]\n\t\t\t\t\t-tmp1*njac[j-1][2][3];\n\t\t\t\tlhs[j][AA][3][3]=-tmp2*fjac[j-1][3][3]\n\t\t\t\t\t-tmp1*njac[j-1][3][3]\n\t\t\t\t\t-tmp1*dy4;\n\t\t\t\tlhs[j][AA][4][3]=-tmp2*fjac[j-1][4][3]\n\t\t\t\t\t-tmp1*njac[j-1][4][3];\n\t\t\t\tlhs[j][AA][0][4]=-tmp2*fjac[j-1][0][4]\n\t\t\t\t\t-tmp1*njac[j-1][0][4];\n\t\t\t\tlhs[j][AA][1][4]=-tmp2*fjac[j-1][1][4]\n\t\t\t\t\t-tmp1*njac[j-1][1][4];\n\t\t\t\tlhs[j][AA][2][4]=-tmp2*fjac[j-1][2][4]\n\t\t\t\t\t-tmp1*njac[j-1][2][4];\n\t\t\t\tlhs[j][AA][3][4]=-tmp2*fjac[j-1][3][4]\n\t\t\t\t\t-tmp1*njac[j-1][3][4];\n\t\t\t\tlhs[j][AA][4][4]=-tmp2*fjac[j-1][4][4]\n\t\t\t\t\t-tmp1*njac[j-1][4][4]\n\t\t\t\t\t-tmp1*dy5;\n\t\t\t\tlhs[j][BB][0][0]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[j][0][0]\n\t\t\t\t\t+tmp1*2.0*dy1;\n\t\t\t\tlhs[j][BB][1][0]=tmp1*2.0*njac[j][1][0];\n\t\t\t\tlhs[j][BB][2][0]=tmp1*2.0*njac[j][2][0];\n\t\t\t\tlhs[j][BB][3][0]=tmp1*2.0*njac[j][3][0];\n\t\t\t\tlhs[j][BB][4][0]=tmp1*2.0*njac[j][4][0];\n\t\t\t\tlhs[j][BB][0][1]=tmp1*2.0*njac[j][0][1];\n\t\t\t\tlhs[j][BB][1][1]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[j][1][1]\n\t\t\t\t\t+tmp1*2.0*dy2;\n\t\t\t\tlhs[j][BB][2][1]=tmp1*2.0*njac[j][2][1];\n\t\t\t\tlhs[j][BB][3][1]=tmp1*2.0*njac[j][3][1];\n\t\t\t\tlhs[j][BB][4][1]=tmp1*2.0*njac[j][4][1];\n\t\t\t\tlhs[j][BB][0][2]=tmp1*2.0*njac[j][0][2];\n\t\t\t\tlhs[j][BB][1][2]=tmp1*2.0*njac[j][1][2];\n\t\t\t\tlhs[j][BB][2][2]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[j][2][2]\n\t\t\t\t\t+tmp1*2.0*dy3;\n\t\t\t\tlhs[j][BB][3][2]=tmp1*2.0*njac[j][3][2];\n\t\t\t\tlhs[j][BB][4][2]=tmp1*2.0*njac[j][4][2];\n\t\t\t\tlhs[j][BB][0][3]=tmp1*2.0*njac[j][0][3];\n\t\t\t\tlhs[j][BB][1][3]=tmp1*2.0*njac[j][1][3];\n\t\t\t\tlhs[j][BB][2][3]=tmp1*2.0*njac[j][2][3];\n\t\t\t\tlhs[j][BB][3][3]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[j][3][3]\n\t\t\t\t\t+tmp1*2.0*dy4;\n\t\t\t\tlhs[j][BB][4][3]=tmp1*2.0*njac[j][4][3];\n\t\t\t\tlhs[j][BB][0][4]=tmp1*2.0*njac[j][0][4];\n\t\t\t\tlhs[j][BB][1][4]=tmp1*2.0*njac[j][1][4];\n\t\t\t\tlhs[j][BB][2][4]=tmp1*2.0*njac[j][2][4];\n\t\t\t\tlhs[j][BB][3][4]=tmp1*2.0*njac[j][3][4];\n\t\t\t\tlhs[j][BB][4][4]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[j][4][4] \n\t\t\t\t\t+tmp1*2.0*dy5;\n\t\t\t\tlhs[j][CC][0][0]=tmp2*fjac[j+1][0][0]\n\t\t\t\t\t-tmp1*njac[j+1][0][0]\n\t\t\t\t\t-tmp1*dy1;\n\t\t\t\tlhs[j][CC][1][0]=tmp2*fjac[j+1][1][0]\n\t\t\t\t\t-tmp1*njac[j+1][1][0];\n\t\t\t\tlhs[j][CC][2][0]=tmp2*fjac[j+1][2][0]\n\t\t\t\t\t-tmp1*njac[j+1][2][0];\n\t\t\t\tlhs[j][CC][3][0]=tmp2*fjac[j+1][3][0]\n\t\t\t\t\t-tmp1*njac[j+1][3][0];\n\t\t\t\tlhs[j][CC][4][0]=tmp2*fjac[j+1][4][0]\n\t\t\t\t\t-tmp1*njac[j+1][4][0];\n\t\t\t\tlhs[j][CC][0][1]=tmp2*fjac[j+1][0][1]\n\t\t\t\t\t-tmp1*njac[j+1][0][1];\n\t\t\t\tlhs[j][CC][1][1]=tmp2*fjac[j+1][1][1]\n\t\t\t\t\t-tmp1*njac[j+1][1][1]\n\t\t\t\t\t-tmp1*dy2;\n\t\t\t\tlhs[j][CC][2][1]=tmp2*fjac[j+1][2][1]\n\t\t\t\t\t-tmp1*njac[j+1][2][1];\n\t\t\t\tlhs[j][CC][3][1]=tmp2*fjac[j+1][3][1]\n\t\t\t\t\t-tmp1*njac[j+1][3][1];\n\t\t\t\tlhs[j][CC][4][1]=tmp2*fjac[j+1][4][1]\n\t\t\t\t\t-tmp1*njac[j+1][4][1];\n\t\t\t\tlhs[j][CC][0][2]=tmp2*fjac[j+1][0][2]\n\t\t\t\t\t-tmp1*njac[j+1][0][2];\n\t\t\t\tlhs[j][CC][1][2]=tmp2*fjac[j+1][1][2]\n\t\t\t\t\t-tmp1*njac[j+1][1][2];\n\t\t\t\tlhs[j][CC][2][2]=tmp2*fjac[j+1][2][2]\n\t\t\t\t\t-tmp1*njac[j+1][2][2]\n\t\t\t\t\t-tmp1*dy3;\n\t\t\t\tlhs[j][CC][3][2]=tmp2*fjac[j+1][3][2]\n\t\t\t\t\t-tmp1*njac[j+1][3][2];\n\t\t\t\tlhs[j][CC][4][2]=tmp2*fjac[j+1][4][2]\n\t\t\t\t\t-tmp1*njac[j+1][4][2];\n\t\t\t\tlhs[j][CC][0][3]=tmp2*fjac[j+1][0][3]\n\t\t\t\t\t-tmp1*njac[j+1][0][3];\n\t\t\t\tlhs[j][CC][1][3]=tmp2*fjac[j+1][1][3]\n\t\t\t\t\t-tmp1*njac[j+1][1][3];\n\t\t\t\tlhs[j][CC][2][3]=tmp2*fjac[j+1][2][3]\n\t\t\t\t\t-tmp1*njac[j+1][2][3];\n\t\t\t\tlhs[j][CC][3][3]=tmp2*fjac[j+1][3][3]\n\t\t\t\t\t-tmp1*njac[j+1][3][3]\n\t\t\t\t\t-tmp1*dy4;\n\t\t\t\tlhs[j][CC][4][3]=tmp2*fjac[j+1][4][3]\n\t\t\t\t\t-tmp1*njac[j+1][4][3];\n\t\t\t\tlhs[j][CC][0][4]=tmp2*fjac[j+1][0][4]\n\t\t\t\t\t-tmp1*njac[j+1][0][4];\n\t\t\t\tlhs[j][CC][1][4]=tmp2*fjac[j+1][1][4]\n\t\t\t\t\t-tmp1*njac[j+1][1][4];\n\t\t\t\tlhs[j][CC][2][4]=tmp2*fjac[j+1][2][4]\n\t\t\t\t\t-tmp1*njac[j+1][2][4];\n\t\t\t\tlhs[j][CC][3][4]=tmp2*fjac[j+1][3][4]\n\t\t\t\t\t-tmp1*njac[j+1][3][4];\n\t\t\t\tlhs[j][CC][4][4]=tmp2*fjac[j+1][4][4]\n\t\t\t\t\t-tmp1*njac[j+1][4][4]\n\t\t\t\t\t-tmp1*dy5;\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * performs guaussian elimination on this cell.\n\t\t\t *\n\t\t\t * assumes that unpacking routines for non-first cells \n\t\t\t * preload c' and rhs' from previous cell.\n\t\t\t * \n\t\t\t * assumed send happens outside this routine, but that\n\t\t\t * c'(JMAX) and rhs'(JMAX) will be sent to next cell\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * multiply c(i,0,k) by b_inverse and copy back to c\n\t\t\t * multiply rhs(0) by b_inverse(0) and copy to rhs\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tbinvcrhs(lhs[0][BB], lhs[0][CC], rhs[k][0][i]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * begin inner most do loop\n\t\t\t * do all the elements of the cell unless last \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(j=1; j<=jsize-1; j++){\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * subtract A*lhs_vector(j-1) from lhs_vector(j)\n\t\t\t\t *  \n\t\t\t\t * rhs(j) = rhs(j) - A*rhs(j-1)\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tmatvec_sub(lhs[j][AA], rhs[k][j-1][i], rhs[k][j][i]);\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * B(j) = B(j) - C(j-1)*A(j)\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tmatmul_sub(lhs[j][AA], lhs[j-1][CC], lhs[j][BB]);\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * multiply c(i,j,k) by b_inverse and copy back to c\n\t\t\t\t * multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tbinvcrhs(lhs[j][BB], lhs[j][CC], rhs[k][j][i]);\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tmatvec_sub(lhs[jsize][AA], rhs[k][jsize-1][i], rhs[k][jsize][i]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * B(jsize) = B(jsize) - C(jsize-1)*A(jsize)\n\t\t\t * matmul_sub(aa,i,jsize,k,c,\n\t\t\t * $ cc,i,jsize-1,k,c,bb,i,jsize,k)\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tmatmul_sub(lhs[jsize][AA], lhs[jsize-1][CC], lhs[jsize][BB]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * multiply rhs(jsize) by b_inverse(jsize) and copy to rhs\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tbinvrhs(lhs[jsize][BB], rhs[k][jsize][i]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * back solve: if last cell, then generate U(jsize)=rhs(jsize)\n\t\t\t * else assume U(jsize) is loaded in un pack backsub_info\n\t\t\t * so just use it\n\t\t\t * after u(jstart) will be sent to next cell\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(j=jsize-1; j>=0; j--){\n\t\t\t\tfor(m=0; m<BLOCK_SIZE; m++){\n\t\t\t\t\tfor(n=0; n<BLOCK_SIZE; n++){\n\t\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-lhs[j][CC][n][m]*rhs[k][j+1][i][n];\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_YSOLVE);}\n}\n\n/*\n * ---------------------------------------------------------------------\n * performs line solves in Z direction by first factoring\n * the block-tridiagonal matrix into an upper triangular matrix, \n * and then performing back substitution to solve for the unknow\n * vectors of each line.  \n *  \n * make sure we treat elements zero to cell_size in the direction\n * of the sweep.\n * ---------------------------------------------------------------------\n */\nvoid z_solve(){\n\tint i, j, k, m, n, ksize;\n\tif(timeron){timer_start(T_ZSOLVE);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * this function computes the left hand side for the three z-factors   \n\t * ---------------------------------------------------------------------\n\t */\n\tksize = grid_points[2]-1;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute the indices for storing the block-diagonal matrix;\n\t * determine c (labeled f) and s jacobians\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(k=0; k<=ksize; k++){\n\t\t\t\ttmp1=1.0/u[k][j][i][0];\n\t\t\t\ttmp2=tmp1*tmp1;\n\t\t\t\ttmp3=tmp1*tmp2;\n\t\t\t\tfjac[k][0][0]=0.0;\n\t\t\t\tfjac[k][1][0]=0.0;\n\t\t\t\tfjac[k][2][0]=0.0;\n\t\t\t\tfjac[k][3][0]=1.0;\n\t\t\t\tfjac[k][4][0]=0.0;\n\t\t\t\tfjac[k][0][1]=-(u[k][j][i][1]*u[k][j][i][3])*tmp2;\n\t\t\t\tfjac[k][1][1]=u[k][j][i][3]*tmp1;\n\t\t\t\tfjac[k][2][1]=0.0;\n\t\t\t\tfjac[k][3][1]=u[k][j][i][1]*tmp1;\n\t\t\t\tfjac[k][4][1]=0.0;\n\t\t\t\tfjac[k][0][2]=-(u[k][j][i][2]*u[k][j][i][3])*tmp2;\n\t\t\t\tfjac[k][1][2]=0.0;\n\t\t\t\tfjac[k][2][2]=u[k][j][i][3]*tmp1;\n\t\t\t\tfjac[k][3][2]=u[k][j][i][2]*tmp1;\n\t\t\t\tfjac[k][4][2]=0.0;\n\t\t\t\tfjac[k][0][3]=-(u[k][j][i][3]*u[k][j][i][3]*tmp2)+c2*qs[k][j][i];\n\t\t\t\tfjac[k][1][3]=-c2*u[k][j][i][1]*tmp1;\n\t\t\t\tfjac[k][2][3]=-c2*u[k][j][i][2]*tmp1;\n\t\t\t\tfjac[k][3][3]=(2.0-c2)*u[k][j][i][3]*tmp1;\n\t\t\t\tfjac[k][4][3]=c2;\n\t\t\t\tfjac[k][0][4]=(c2*2.0*square[k][j][i]-c1*u[k][j][i][4])*u[k][j][i][3]*tmp2;\n\t\t\t\tfjac[k][1][4]=-c2*(u[k][j][i][1]*u[k][j][i][3])*tmp2;\n\t\t\t\tfjac[k][2][4]=-c2*(u[k][j][i][2]*u[k][j][i][3])*tmp2;\n\t\t\t\tfjac[k][3][4]=c1*(u[k][j][i][4]*tmp1)-c2*(qs[k][j][i]+u[k][j][i][3]*u[k][j][i][3]*tmp2);\n\t\t\t\tfjac[k][4][4]=c1*u[k][j][i][3]*tmp1;\n\t\t\t\tnjac[k][0][0]=0.0;\n\t\t\t\tnjac[k][1][0]=0.0;\n\t\t\t\tnjac[k][2][0]=0.0;\n\t\t\t\tnjac[k][3][0]=0.0;\n\t\t\t\tnjac[k][4][0]=0.0;\n\t\t\t\tnjac[k][0][1]=-c3c4*tmp2*u[k][j][i][1];\n\t\t\t\tnjac[k][1][1]=c3c4*tmp1;\n\t\t\t\tnjac[k][2][1]=0.0;\n\t\t\t\tnjac[k][3][1]=0.0;\n\t\t\t\tnjac[k][4][1]=0.0;\n\t\t\t\tnjac[k][0][2]=-c3c4*tmp2*u[k][j][i][2];\n\t\t\t\tnjac[k][1][2]=0.0;\n\t\t\t\tnjac[k][2][2]=c3c4*tmp1;\n\t\t\t\tnjac[k][3][2]=0.0;\n\t\t\t\tnjac[k][4][2]=0.0;\n\t\t\t\tnjac[k][0][3]=-con43*c3c4*tmp2*u[k][j][i][3];\n\t\t\t\tnjac[k][1][3]=0.0;\n\t\t\t\tnjac[k][2][3]=0.0;\n\t\t\t\tnjac[k][3][3]=con43*c3*c4*tmp1;\n\t\t\t\tnjac[k][4][3]=0.0;\n\t\t\t\tnjac[k][0][4]=-(c3c4-c1345)*tmp3*(u[k][j][i][1]*u[k][j][i][1])\n\t\t\t\t\t-(c3c4-c1345)*tmp3*(u[k][j][i][2]*u[k][j][i][2])\n\t\t\t\t\t-(con43*c3c4-c1345)*tmp3*(u[k][j][i][3]*u[k][j][i][3])\n\t\t\t\t\t-c1345*tmp2*u[k][j][i][4];\n\t\t\t\tnjac[k][1][4]=(c3c4-c1345)*tmp2*u[k][j][i][1];\n\t\t\t\tnjac[k][2][4]=(c3c4-c1345)*tmp2*u[k][j][i][2];\n\t\t\t\tnjac[k][3][4]=(con43*c3c4-c1345)*tmp2*u[k][j][i][3];\n\t\t\t\tnjac[k][4][4]=(c1345)*tmp1;\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * now jacobians set, so form left hand side in z direction\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tlhsinit(lhs, ksize);\n\t\t\tfor(k=1; k<=ksize-1; k++){\n\t\t\t\ttmp1=dt*tz1;\n\t\t\t\ttmp2=dt*tz2;\n\t\t\t\tlhs[k][AA][0][0]=-tmp2*fjac[k-1][0][0]\n\t\t\t\t\t-tmp1*njac[k-1][0][0]\n\t\t\t\t\t-tmp1*dz1; \n\t\t\t\tlhs[k][AA][1][0]=-tmp2*fjac[k-1][1][0]\n\t\t\t\t\t-tmp1*njac[k-1][1][0];\n\t\t\t\tlhs[k][AA][2][0]=-tmp2*fjac[k-1][2][0]\n\t\t\t\t\t-tmp1*njac[k-1][2][0];\n\t\t\t\tlhs[k][AA][3][0]=-tmp2*fjac[k-1][3][0]\n\t\t\t\t\t-tmp1*njac[k-1][3][0];\n\t\t\t\tlhs[k][AA][4][0]=-tmp2*fjac[k-1][4][0]\n\t\t\t\t\t-tmp1*njac[k-1][4][0];\n\t\t\t\tlhs[k][AA][0][1]=-tmp2*fjac[k-1][0][1]\n\t\t\t\t\t-tmp1*njac[k-1][0][1];\n\t\t\t\tlhs[k][AA][1][1]=-tmp2*fjac[k-1][1][1]\n\t\t\t\t\t-tmp1*njac[k-1][1][1]\n\t\t\t\t\t-tmp1*dz2;\n\t\t\t\tlhs[k][AA][2][1]=-tmp2*fjac[k-1][2][1]\n\t\t\t\t\t-tmp1*njac[k-1][2][1];\n\t\t\t\tlhs[k][AA][3][1]=-tmp2*fjac[k-1][3][1]\n\t\t\t\t\t-tmp1*njac[k-1][3][1];\n\t\t\t\tlhs[k][AA][4][1]=-tmp2*fjac[k-1][4][1]\n\t\t\t\t\t-tmp1*njac[k-1][4][1];\n\t\t\t\tlhs[k][AA][0][2]=-tmp2*fjac[k-1][0][2]\n\t\t\t\t\t-tmp1*njac[k-1][0][2];\n\t\t\t\tlhs[k][AA][1][2]=-tmp2*fjac[k-1][1][2]\n\t\t\t\t\t-tmp1*njac[k-1][1][2];\n\t\t\t\tlhs[k][AA][2][2]=-tmp2*fjac[k-1][2][2]\n\t\t\t\t\t-tmp1*njac[k-1][2][2]\n\t\t\t\t\t-tmp1*dz3;\n\t\t\t\tlhs[k][AA][3][2]=-tmp2*fjac[k-1][3][2]\n\t\t\t\t\t-tmp1*njac[k-1][3][2];\n\t\t\t\tlhs[k][AA][4][2]=-tmp2*fjac[k-1][4][2]\n\t\t\t\t\t-tmp1*njac[k-1][4][2];\n\t\t\t\tlhs[k][AA][0][3]=-tmp2*fjac[k-1][0][3]\n\t\t\t\t\t-tmp1*njac[k-1][0][3];\n\t\t\t\tlhs[k][AA][1][3]=-tmp2*fjac[k-1][1][3]\n\t\t\t\t\t-tmp1*njac[k-1][1][3];\n\t\t\t\tlhs[k][AA][2][3]=-tmp2*fjac[k-1][2][3]\n\t\t\t\t\t-tmp1*njac[k-1][2][3];\n\t\t\t\tlhs[k][AA][3][3]=-tmp2*fjac[k-1][3][3]\n\t\t\t\t\t-tmp1*njac[k-1][3][3]\n\t\t\t\t\t-tmp1*dz4;\n\t\t\t\tlhs[k][AA][4][3]=-tmp2*fjac[k-1][4][3]\n\t\t\t\t\t-tmp1*njac[k-1][4][3];\n\t\t\t\tlhs[k][AA][0][4]=-tmp2*fjac[k-1][0][4]\n\t\t\t\t\t-tmp1*njac[k-1][0][4];\n\t\t\t\tlhs[k][AA][1][4]=-tmp2*fjac[k-1][1][4]\n\t\t\t\t\t-tmp1*njac[k-1][1][4];\n\t\t\t\tlhs[k][AA][2][4]=-tmp2*fjac[k-1][2][4]\n\t\t\t\t\t-tmp1*njac[k-1][2][4];\n\t\t\t\tlhs[k][AA][3][4]=-tmp2*fjac[k-1][3][4]\n\t\t\t\t\t-tmp1*njac[k-1][3][4];\n\t\t\t\tlhs[k][AA][4][4]=-tmp2*fjac[k-1][4][4]\n\t\t\t\t\t-tmp1*njac[k-1][4][4]\n\t\t\t\t\t-tmp1*dz5;\n\t\t\t\tlhs[k][BB][0][0]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[k][0][0]\n\t\t\t\t\t+tmp1*2.0*dz1;\n\t\t\t\tlhs[k][BB][1][0]=tmp1*2.0*njac[k][1][0];\n\t\t\t\tlhs[k][BB][2][0]=tmp1*2.0*njac[k][2][0];\n\t\t\t\tlhs[k][BB][3][0]=tmp1*2.0*njac[k][3][0];\n\t\t\t\tlhs[k][BB][4][0]=tmp1*2.0*njac[k][4][0];\n\t\t\t\tlhs[k][BB][0][1]=tmp1*2.0*njac[k][0][1];\n\t\t\t\tlhs[k][BB][1][1]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[k][1][1]\n\t\t\t\t\t+tmp1*2.0*dz2;\n\t\t\t\tlhs[k][BB][2][1]=tmp1*2.0*njac[k][2][1];\n\t\t\t\tlhs[k][BB][3][1]=tmp1*2.0*njac[k][3][1];\n\t\t\t\tlhs[k][BB][4][1]=tmp1*2.0*njac[k][4][1];\n\t\t\t\tlhs[k][BB][0][2]=tmp1*2.0*njac[k][0][2];\n\t\t\t\tlhs[k][BB][1][2]=tmp1*2.0*njac[k][1][2];\n\t\t\t\tlhs[k][BB][2][2]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[k][2][2]\n\t\t\t\t\t+tmp1*2.0*dz3;\n\t\t\t\tlhs[k][BB][3][2]=tmp1*2.0*njac[k][3][2];\n\t\t\t\tlhs[k][BB][4][2]=tmp1*2.0*njac[k][4][2];\n\t\t\t\tlhs[k][BB][0][3]=tmp1*2.0*njac[k][0][3];\n\t\t\t\tlhs[k][BB][1][3]=tmp1*2.0*njac[k][1][3];\n\t\t\t\tlhs[k][BB][2][3]=tmp1*2.0*njac[k][2][3];\n\t\t\t\tlhs[k][BB][3][3]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[k][3][3]\n\t\t\t\t\t+tmp1*2.0*dz4;\n\t\t\t\tlhs[k][BB][4][3]=tmp1*2.0*njac[k][4][3];\n\t\t\t\tlhs[k][BB][0][4]=tmp1*2.0*njac[k][0][4];\n\t\t\t\tlhs[k][BB][1][4]=tmp1*2.0*njac[k][1][4];\n\t\t\t\tlhs[k][BB][2][4]=tmp1*2.0*njac[k][2][4];\n\t\t\t\tlhs[k][BB][3][4]=tmp1*2.0*njac[k][3][4];\n\t\t\t\tlhs[k][BB][4][4]=1.0\n\t\t\t\t\t+tmp1*2.0*njac[k][4][4] \n\t\t\t\t\t+tmp1*2.0*dz5;\n\t\t\t\tlhs[k][CC][0][0]=tmp2*fjac[k+1][0][0]\n\t\t\t\t\t-tmp1*njac[k+1][0][0]\n\t\t\t\t\t-tmp1*dz1;\n\t\t\t\tlhs[k][CC][1][0]=tmp2*fjac[k+1][1][0]\n\t\t\t\t\t-tmp1*njac[k+1][1][0];\n\t\t\t\tlhs[k][CC][2][0]=tmp2*fjac[k+1][2][0]\n\t\t\t\t\t-tmp1*njac[k+1][2][0];\n\t\t\t\tlhs[k][CC][3][0]=tmp2*fjac[k+1][3][0]\n\t\t\t\t\t-tmp1*njac[k+1][3][0];\n\t\t\t\tlhs[k][CC][4][0]=tmp2*fjac[k+1][4][0]\n\t\t\t\t\t-tmp1*njac[k+1][4][0];\n\t\t\t\tlhs[k][CC][0][1]=tmp2*fjac[k+1][0][1]\n\t\t\t\t\t-tmp1*njac[k+1][0][1];\n\t\t\t\tlhs[k][CC][1][1]=tmp2*fjac[k+1][1][1]\n\t\t\t\t\t-tmp1*njac[k+1][1][1]\n\t\t\t\t\t-tmp1*dz2;\n\t\t\t\tlhs[k][CC][2][1]=tmp2*fjac[k+1][2][1]\n\t\t\t\t\t-tmp1*njac[k+1][2][1];\n\t\t\t\tlhs[k][CC][3][1]=tmp2*fjac[k+1][3][1]\n\t\t\t\t\t-tmp1*njac[k+1][3][1];\n\t\t\t\tlhs[k][CC][4][1]=tmp2*fjac[k+1][4][1]\n\t\t\t\t\t-tmp1*njac[k+1][4][1];\n\t\t\t\tlhs[k][CC][0][2]=tmp2*fjac[k+1][0][2]\n\t\t\t\t\t-tmp1*njac[k+1][0][2];\n\t\t\t\tlhs[k][CC][1][2]= tmp2*fjac[k+1][1][2]\n\t\t\t\t\t-tmp1*njac[k+1][1][2];\n\t\t\t\tlhs[k][CC][2][2]=tmp2*fjac[k+1][2][2]\n\t\t\t\t\t-tmp1*njac[k+1][2][2]\n\t\t\t\t\t-tmp1*dz3;\n\t\t\t\tlhs[k][CC][3][2]=tmp2*fjac[k+1][3][2]\n\t\t\t\t\t-tmp1*njac[k+1][3][2];\n\t\t\t\tlhs[k][CC][4][2]=tmp2*fjac[k+1][4][2]\n\t\t\t\t\t-tmp1*njac[k+1][4][2];\t\t\t\t\t\n\t\t\t\tlhs[k][CC][0][3]=tmp2*fjac[k+1][0][3]\n\t\t\t\t\t-tmp1*njac[k+1][0][3];\n\t\t\t\tlhs[k][CC][1][3]=tmp2*fjac[k+1][1][3]\n\t\t\t\t\t-tmp1*njac[k+1][1][3];\n\t\t\t\tlhs[k][CC][2][3]=tmp2*fjac[k+1][2][3]\n\t\t\t\t\t-tmp1*njac[k+1][2][3];\n\t\t\t\tlhs[k][CC][3][3]=tmp2*fjac[k+1][3][3]\n\t\t\t\t\t-tmp1*njac[k+1][3][3]\n\t\t\t\t\t-tmp1*dz4;\n\t\t\t\tlhs[k][CC][4][3]=tmp2*fjac[k+1][4][3]\n\t\t\t\t\t-tmp1*njac[k+1][4][3];\n\t\t\t\tlhs[k][CC][0][4]=tmp2*fjac[k+1][0][4]\n\t\t\t\t\t-tmp1*njac[k+1][0][4];\n\t\t\t\tlhs[k][CC][1][4]=tmp2*fjac[k+1][1][4]\n\t\t\t\t\t-tmp1*njac[k+1][1][4];\n\t\t\t\tlhs[k][CC][2][4]=tmp2*fjac[k+1][2][4]\n\t\t\t\t\t-tmp1*njac[k+1][2][4];\n\t\t\t\tlhs[k][CC][3][4]=tmp2*fjac[k+1][3][4]\n\t\t\t\t\t-tmp1*njac[k+1][3][4];\n\t\t\t\tlhs[k][CC][4][4]=tmp2*fjac[k+1][4][4]\n\t\t\t\t\t-tmp1*njac[k+1][4][4]\n\t\t\t\t\t-tmp1*dz5;\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * performs guaussian elimination on this cell.\n\t\t\t *  \n\t\t\t * assumes that unpacking routines for non-first cells \n\t\t\t * preload c' and rhs' from previous cell.\n\t\t\t *  \n\t\t\t * assumed send happens outside this routine, but that\n\t\t\t * c'(KMAX) and rhs'(KMAX) will be sent to next cell.\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * outer most do loops - sweeping in i direction\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * multiply c(i,j,0) by b_inverse and copy back to c\n\t\t\t * multiply rhs(0) by b_inverse(0) and copy to rhs\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tbinvcrhs(lhs[0][BB], lhs[0][CC], rhs[0][j][i]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * begin inner most do loop\n\t\t\t * do all the elements of the cell unless last \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(k=1; k<=ksize-1; k++){\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * subtract A*lhs_vector(k-1) from lhs_vector(k)\n\t\t\t\t *  \n\t\t\t\t * rhs(k) = rhs(k) - A*rhs(k-1)\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tmatvec_sub(lhs[k][AA], rhs[k-1][j][i], rhs[k][j][i]);\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * B(k) = B(k) - C(k-1)*A(k)\n\t\t\t\t * matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k)\n\t\t\t\t * --------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tmatmul_sub(lhs[k][AA], lhs[k-1][CC], lhs[k][BB]);\n\t\t\t\t/*\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t * multiply c(i,j,k) by b_inverse and copy back to c\n\t\t\t\t * multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs\n\t\t\t\t * -------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\tbinvcrhs(lhs[k][BB], lhs[k][CC], rhs[k][j][i]);\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * now finish up special cases for last cell\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tmatvec_sub(lhs[ksize][AA], rhs[ksize-1][j][i], rhs[ksize][j][i]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * B(ksize) = B(ksize) - C(ksize-1)*A(ksize)\n\t\t\t * matmul_sub(aa,i,j,ksize,c,\n\t\t\t * $ cc,i,j,ksize-1,c,bb,i,j,ksize)\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tmatmul_sub(lhs[ksize][AA], lhs[ksize-1][CC], lhs[ksize][BB]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * multiply rhs(ksize) by b_inverse(ksize) and copy to rhs\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tbinvrhs(lhs[ksize][BB], rhs[ksize][j][i]);\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * back solve: if last cell, then generate U(ksize)=rhs(ksize)\n\t\t\t * else assume U(ksize) is loaded in un pack backsub_info\n\t\t\t * so just use it\n\t\t\t * after u(kstart) will be sent to next cell\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(k=ksize-1; k>=0; k--){\n\t\t\t\tfor(m=0; m<BLOCK_SIZE; m++){\n\t\t\t\t\tfor(n=0; n<BLOCK_SIZE; n++){\n\t\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-lhs[k][CC][n][m]*rhs[k+1][j][i][n];\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_ZSOLVE);}\n}", "label": 2}
{"code": "\n\n#include \"../common/npb-CPP.hpp\"\n#include \"npbparams.hpp\"\n\n#define IMAX PROBLEM_SIZE\n#define JMAX PROBLEM_SIZE\n#define KMAX PROBLEM_SIZE\n#define IMAXP (IMAX/2*2)\n#define JMAXP (JMAX/2*2)\n#define T_TOTAL 1\n#define T_RHSX 2\n#define T_RHSY 3\n#define T_RHSZ 4\n#define T_RHS 5\n#define T_XSOLVE 6\n#define T_YSOLVE 7\n#define T_ZSOLVE 8\n#define T_RDIS1 9\n#define T_RDIS2 10\n#define T_TXINVR 11\n#define T_PINVR 12\n#define T_NINVR 13\n#define T_TZETAR 14\n#define T_ADD 15\n#define T_LAST 15\n\n/* global variables */\n#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION)\nstatic double u[KMAX][JMAXP+1][IMAXP+1][5];\nstatic double us[KMAX][JMAXP+1][IMAXP+1];\nstatic double vs[KMAX][JMAXP+1][IMAXP+1];\nstatic double ws[KMAX][JMAXP+1][IMAXP+1];\nstatic double qs[KMAX][JMAXP+1][IMAXP+1];\nstatic double rho_i[KMAX][JMAXP+1][IMAXP+1];\nstatic double speed[KMAX][JMAXP+1][IMAXP+1];\nstatic double square[KMAX][JMAXP+1][IMAXP+1];\nstatic double rhs[KMAX][JMAXP+1][IMAXP+1][5];\nstatic double forcing[KMAX][JMAXP+1][IMAXP+1][5];\nstatic double cv[PROBLEM_SIZE];\nstatic double rhon[PROBLEM_SIZE];\nstatic double rhos[PROBLEM_SIZE];\nstatic double rhoq[PROBLEM_SIZE];\nstatic double cuf[PROBLEM_SIZE];\nstatic double q[PROBLEM_SIZE];\nstatic double ue[5][PROBLEM_SIZE];\nstatic double buf[5][PROBLEM_SIZE];\nstatic double lhs[IMAXP+1][IMAXP+1][5];\nstatic double lhsp[IMAXP+1][IMAXP+1][5];\nstatic double lhsm[IMAXP+1][IMAXP+1][5];\nstatic double ce[13][5];\n#else\nstatic double (*u)[JMAXP+1][IMAXP+1][5]=(double(*)[JMAXP+1][IMAXP+1][5])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*us)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*vs)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*ws)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*qs)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*rho_i)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*speed)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*square)[JMAXP+1][IMAXP+1]=(double(*)[JMAXP+1][IMAXP+1])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)));\nstatic double (*rhs)[JMAXP+1][IMAXP+1][5]=(double(*)[JMAXP+1][IMAXP+1][5])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*forcing)[JMAXP+1][IMAXP+1][5]=(double(*)[JMAXP+1][IMAXP+1][5])malloc(sizeof(double)*((KMAX)*(JMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*cv)=(double*)malloc(sizeof(double)*(PROBLEM_SIZE));\nstatic double (*rhon)=(double*)malloc(sizeof(double)*(PROBLEM_SIZE));\nstatic double (*rhos)=(double*)malloc(sizeof(double)*(PROBLEM_SIZE));\nstatic double (*rhoq)=(double*)malloc(sizeof(double)*(PROBLEM_SIZE));\nstatic double (*cuf)=(double*)malloc(sizeof(double)*(PROBLEM_SIZE));\nstatic double (*q)=(double*)malloc(sizeof(double)*(PROBLEM_SIZE));\nstatic double (*ue)[PROBLEM_SIZE]=(double(*)[PROBLEM_SIZE])malloc(sizeof(double)*((PROBLEM_SIZE)*(5)));\nstatic double (*buf)[PROBLEM_SIZE]=(double(*)[PROBLEM_SIZE])malloc(sizeof(double)*((PROBLEM_SIZE)*(5)));\nstatic double (*lhs)[IMAXP+1][5]=(double(*)[IMAXP+1][5])malloc(sizeof(double)*((IMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*lhsp)[IMAXP+1][5]=(double(*)[IMAXP+1][5])malloc(sizeof(double)*((IMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*lhsm)[IMAXP+1][5]=(double(*)[IMAXP+1][5])malloc(sizeof(double)*((IMAXP+1)*(IMAXP+1)*(5)));\nstatic double (*ce)[5]=(double(*)[5])malloc(sizeof(double)*((13)*(5)));\n#endif\nstatic double tx1, tx2, tx3,ty1, ty2, ty3, tz1, tz2, tz3, \n\t      dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, \n\t      dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, \n\t      dxmax, dymax, dzmax, xxcon1, xxcon2, \n\t      xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,\n\t      dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,\n\t      yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,\n\t      zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, \n\t      dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, \n\t      dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, \n\t      c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,\n\t      dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, \n\t      c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, \n\t      c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16;\nstatic int grid_points[3], nx2, ny2, nz2;\nstatic boolean timeron;\n\n/* function prototypes */\nvoid add();\nvoid adi();\nvoid compute_rhs();\nvoid error_norm(double rms[]);\nvoid exact_rhs();\nvoid exact_solution(double xi, double eta, double zeta, double dtemp[]);\nvoid initialize();\nvoid lhsinit(int ni, int nj);\nvoid lhsinitj(int nj, int ni);\nvoid ninvr();\nvoid pinvr();\nvoid rhs_norm(double rms[]);\nvoid set_constants();\nvoid txinvr();\nvoid tzetar();\nvoid verify(int no_time_steps, char* class_npb, boolean* verified);\nvoid x_solve();\nvoid y_solve();\nvoid z_solve();\n\n/* sp */\nint main(int argc, char* argv[]){\n#if defined(DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION)\n\tprintf(\" DO_NOT_ALLOCATE_ARRAYS_WITH_DYNAMIC_MEMORY_AND_AS_SINGLE_DIMENSION mode on\\n\");\n#endif\n\tint i, niter, step, n3;\n\tdouble mflops, t, tmax, trecs[T_LAST+1];\n\tboolean verified;\n\tchar class_npb;\n\tchar* t_names[T_LAST+1];\n\t/*\n\t * ---------------------------------------------------------------------\n\t * read input file (if it exists), else take\n\t * defaults from parameters\n\t * ---------------------------------------------------------------------\n\t */\n\tFILE* fp;\n\tif((fp=fopen(\"inputsp.data\",\"r\"))!=NULL){\n\t\tint result;\n\t\tprintf(\" Reading from input file inputsp.data\\n\");\n\t\tresult=fscanf(fp,\"%d\", &niter);\n\t\twhile(fgetc(fp)!='\\n');\n\t\tresult=fscanf(fp,\"%lf\",&dt);\n\t\twhile(fgetc(fp)!='\\n');\n\t\tresult=fscanf(fp,\"%d%d%d\",&grid_points[0],&grid_points[1],&grid_points[2]);\n\t\tfclose(fp);\n\t}else{\n\t\tprintf(\" No input file inputsp.data. Using compiled defaults\\n\");\n\t\tniter=NITER_DEFAULT;\n\t\tdt=DT_DEFAULT;\n\t\tgrid_points[0]=PROBLEM_SIZE;\n\t\tgrid_points[1]=PROBLEM_SIZE;\n\t\tgrid_points[2]=PROBLEM_SIZE;\n\t}\n\tif((fp=fopen(\"timer.flag\",\"r\"))!=NULL){\n\t\ttimeron=TRUE;\n\t\tt_names[T_TOTAL]=(char*)\"total\";\n\t\tt_names[T_RHSX]=(char*)\"rhsx\";\n\t\tt_names[T_RHSY]=(char*)\"rhsy\";\n\t\tt_names[T_RHSZ]=(char*)\"rhsz\";\n\t\tt_names[T_RHS]=(char*)\"rhs\";\n\t\tt_names[T_XSOLVE]=(char*)\"xsolve\";\n\t\tt_names[T_YSOLVE]=(char*)\"ysolve\";\n\t\tt_names[T_ZSOLVE]=(char*)\"zsolve\";\n\t\tt_names[T_RDIS1]=(char*)\"redist1\";\n\t\tt_names[T_RDIS2]=(char*)\"redist2\";\n\t\tt_names[T_TZETAR]=(char*)\"tzetar\";\n\t\tt_names[T_NINVR]=(char*)\"ninvr\";\n\t\tt_names[T_PINVR]=(char*)\"pinvr\";\n\t\tt_names[T_TXINVR]=(char*)\"txinvr\";\n\t\tt_names[T_ADD]=(char*)\"add\";\n\t\tfclose(fp);\n\t}else{\n\t\ttimeron = FALSE;\n\t}\t\n\tprintf(\"\\n\\n NAS Parallel Benchmarks 4.1 Serial C++ version - SP Benchmark\\n\\n\");\n\tprintf(\" Size: %4dx%4dx%4d\\n\",grid_points[0],grid_points[1],grid_points[2]);\n\tprintf(\" Iterations: %4d    dt: %10.6f\\n\",niter,dt);\n\tprintf(\"\\n\");\n\tif((grid_points[0]>IMAX)||(grid_points[1]>JMAX)||(grid_points[2]>KMAX)){\n\t\tprintf(\" %d, %d, %d\\n\",grid_points[0],grid_points[1],grid_points[2]);\n\t\tprintf(\" Problem size too big for compiled array sizes\\n\");\n\t\treturn 0;\n\t}\n\tnx2=grid_points[0]-2;\n\tny2=grid_points[1]-2;\n\tnz2=grid_points[2]-2;\n\tset_constants();\n\tfor(i=1;i<=T_LAST;i++){timer_clear(i);}\n\texact_rhs();\n\tinitialize();\n\t/*\n\t * ---------------------------------------------------------------------\n\t * do one time step to touch all code, and reinitialize\n\t * ---------------------------------------------------------------------\n\t */\n\tadi();\n\tinitialize();\n\tfor(i=1;i<=T_LAST;i++){timer_clear(i);}\n\ttimer_start(1);\n\tfor(step=1;step<=niter;step++){\n\t\tif((step%20)==0||step==1){printf(\" Time step %4d\\n\",step);}\n\t\tadi();\n\t}\n\ttimer_stop(1);\n\ttmax=timer_read(1);\n\tverify(niter, &class_npb, &verified);\n\tif(tmax!=0.0){\n\t\tn3=grid_points[0]*grid_points[1]*grid_points[2];\n\t\tt=(grid_points[0]+grid_points[1]+grid_points[2])/3.0;\n\t\tmflops=(881.174*(double)n3-\n\t\t\t\t4683.91*(t*t)+\n\t\t\t\t11484.5*t-\n\t\t\t\t19272.4)*(double)niter/(tmax*1000000.0);\n\t}else{\n\t\tmflops=0.0;\n\t}\n\tc_print_results((char*)\"SP\",\n\t\t\tclass_npb,\n\t\t\tgrid_points[0],\n\t\t\tgrid_points[1],\n\t\t\tgrid_points[2],\n\t\t\tniter,\n\t\t\ttmax,\n\t\t\tmflops,\n\t\t\t(char*)\"          floating point\",\n\t\t\tverified,\n\t\t\t(char*)NPBVERSION,\n\t\t\t(char*)COMPILETIME,\n\t\t\t(char*)COMPILERVERSION,\n\t\t\t(char*)CS1,\n\t\t\t(char*)CS2,\n\t\t\t(char*)CS3,\n\t\t\t(char*)CS4,\n\t\t\t(char*)CS5,\n\t\t\t(char*)CS6,\n\t\t\t(char*)\"(none)\");\n\t/*\n\t * ---------------------------------------------------------------------\n\t * more timers\n\t * ---------------------------------------------------------------------\n\t */\n\tif(timeron){\n\t\tfor(i=1; i<=T_LAST; i++){\n\t\t\ttrecs[i]=timer_read(i);\n\t\t}\n\t\tif(tmax==0.0){tmax=1.0;}\n\t\tprintf(\"  SECTION   Time (secs)\\n\");\n\t\tfor(i=1; i<=T_LAST; i++){\n\t\t\tprintf(\"  %-8s:%9.3f  (%6.2f%%)\\n\",t_names[i],trecs[i],trecs[i]*100./tmax);\n\t\t\tif(i==T_RHS){\n\t\t\t\tt=trecs[T_RHSX]+trecs[T_RHSY]+trecs[T_RHSZ];\n\t\t\t\tprintf(\"    --> %8s:%9.3f  (%6.2f%%)\\n\",\"sub-rhs\",t,t*100./tmax);\n\t\t\t\tt=trecs[T_RHS]-t;\n\t\t\t\tprintf(\"    --> %8s:%9.3f  (%6.2f%%)\\n\",\"rest-rhs\",t,t*100./tmax);\n\t\t\t}else if(i==T_ZSOLVE){\n\t\t\t\tt=trecs[T_ZSOLVE]-trecs[T_RDIS1]-trecs[T_RDIS2];\n\t\t\t\tprintf(\"    --> %8s:%9.3f  (%6.2f%%)\\n\",\"sub-zsol\",t,t*100./tmax);\n\t\t\t}else if(i==T_RDIS2){\n\t\t\t\tt=trecs[T_RDIS1]+trecs[T_RDIS2];\n\t\t\t\tprintf(\"    --> %8s:%9.3f  (%6.2f%%)\\n\",\"redist\",t,t*100./tmax);\n\t\t\t}\n\t\t}\n\t}\n\treturn 0;\n}\n\n/*\n * ---------------------------------------------------------------------\n * addition of update to the vector u\n * ---------------------------------------------------------------------\n */\nvoid add(){\n\tint i, j, k, m;\n\tif(timeron){timer_start(T_ADD);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tu[k][j][i][m]=u[k][j][i][m]+rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_ADD);}\n}\n\nvoid adi(){\n\tcompute_rhs();\n\ttxinvr();\n\tx_solve();\n\ty_solve();\n\tz_solve();\n\tadd();\n}\n\nvoid compute_rhs(){\n\tint i, j, k, m;\n\tdouble aux, rho_inv, uijk, up1, um1, vijk, vp1, vm1, wijk, wp1, wm1;\n\tif(timeron){timer_start(T_RHS);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute the reciprocal of density, and the kinetic energy, \n\t * and the speed of sound. \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\trho_inv=1.0/u[k][j][i][0];\n\t\t\t\trho_i[k][j][i]=rho_inv;\n\t\t\t\tus[k][j][i]=u[k][j][i][1]*rho_inv;\n\t\t\t\tvs[k][j][i]=u[k][j][i][2]*rho_inv;\n\t\t\t\tws[k][j][i]=u[k][j][i][3]*rho_inv;\n\t\t\t\tsquare[k][j][i]=0.5*(\n\t\t\t\t\t\tu[k][j][i][1]*u[k][j][i][1]+ \n\t\t\t\t\t\tu[k][j][i][2]*u[k][j][i][2]+\n\t\t\t\t\t\tu[k][j][i][3]*u[k][j][i][3])*rho_inv;\n\t\t\t\tqs[k][j][i]=square[k][j][i]*rho_inv;\n\t\t\t\t/*\n\t\t\t\t * ---------------------------------------------------------------------\n\t\t\t\t * (don't need speed and ainx until the lhs computation)\n\t\t\t\t * ---------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\taux=c1c2*rho_inv*(u[k][j][i][4]-square[k][j][i]);\n\t\t\t\tspeed[k][j][i]=sqrt(aux);\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * copy the exact forcing term to the right hand side;  because \n\t * this forcing term is known, we can store it on the whole grid\n\t * including the boundary                   \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=forcing[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute xi-direction fluxes \n\t * ---------------------------------------------------------------------\n\t */\n\tif(timeron){timer_start(T_RHSX);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tuijk=us[k][j][i];\n\t\t\t\tup1=us[k][j][i+1];\n\t\t\t\tum1=us[k][j][i-1];\n\t\t\t\trhs[k][j][i][0]=rhs[k][j][i][0]+dx1tx1* \n\t\t\t\t\t(u[k][j][i+1][0]-2.0*u[k][j][i][0]+u[k][j][i-1][0])-\n\t\t\t\t\ttx2*(u[k][j][i+1][1]-u[k][j][i-1][1]);\n\t\t\t\trhs[k][j][i][1]=rhs[k][j][i][1]+dx2tx1* \n\t\t\t\t\t(u[k][j][i+1][1]-2.0*u[k][j][i][1]+u[k][j][i-1][1])+\n\t\t\t\t\txxcon2*con43*(up1-2.0*uijk+um1)-\n\t\t\t\t\ttx2*(u[k][j][i+1][1]*up1-u[k][j][i-1][1]*um1+\n\t\t\t\t\t\t\t(u[k][j][i+1][4]-square[k][j][i+1]-\n\t\t\t\t\t\t\t u[k][j][i-1][4]+square[k][j][i-1])*c2);\n\t\t\t\trhs[k][j][i][2]=rhs[k][j][i][2]+dx3tx1* \n\t\t\t\t\t(u[k][j][i+1][2]-2.0*u[k][j][i][2]+u[k][j][i-1][2])+\n\t\t\t\t\txxcon2*(vs[k][j][i+1]-2.0*vs[k][j][i]+vs[k][j][i-1])-\n\t\t\t\t\ttx2*(u[k][j][i+1][2]*up1-u[k][j][i-1][2]*um1);\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]+dx4tx1* \n\t\t\t\t\t(u[k][j][i+1][3]-2.0*u[k][j][i][3]+u[k][j][i-1][3])+\n\t\t\t\t\txxcon2*(ws[k][j][i+1]-2.0*ws[k][j][i]+ws[k][j][i-1])-\n\t\t\t\t\ttx2*(u[k][j][i+1][3]*up1-u[k][j][i-1][3]*um1);\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]+dx5tx1* \n\t\t\t\t\t(u[k][j][i+1][4]-2.0*u[k][j][i][4]+u[k][j][i-1][4])+\n\t\t\t\t\txxcon3*(qs[k][j][i+1]-2.0*qs[k][j][i]+qs[k][j][i-1])+\n\t\t\t\t\txxcon4*(up1*up1-2.0*uijk*uijk+um1*um1)+\n\t\t\t\t\txxcon5*(u[k][j][i+1][4]*rho_i[k][j][i+1]- \n\t\t\t\t\t\t\t2.0*u[k][j][i][4]*rho_i[k][j][i]+\n\t\t\t\t\t\t\tu[k][j][i-1][4]*rho_i[k][j][i-1])-\n\t\t\t\t\ttx2*((c1*u[k][j][i+1][4]-c2*square[k][j][i+1])*up1-\n\t\t\t\t\t\t\t(c1*u[k][j][i-1][4]-c2*square[k][j][i-1])*um1);\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * add fourth order xi-direction dissipation               \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\ti=1;\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(5.0*u[k][j][i][m]-4.0*u[k][j][i+1][m]+u[k][j][i+2][m]);\n\t\t\t}\n\t\t\ti=2;\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(-4.0*u[k][j][i-1][m]+6.0*u[k][j][i][m]-\n\t\t\t\t\t 4.0*u[k][j][i+1][m]+u[k][j][i+2][m]);\n\t\t\t}\n\t\t}\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=3; i<=nx2-2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t\t(u[k][j][i-2][m]-4.0*u[k][j][i-1][m]+ \n\t\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k][j][i+1][m]+ \n\t\t\t\t\t\t u[k][j][i+2][m]);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\ti=nx2-1;\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m] = rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k][j][i-2][m]-4.0*u[k][j][i-1][m]+ \n\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k][j][i+1][m]);\n\t\t\t}\n\t\t\ti=nx2;\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m] = rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k][j][i-2][m]-4.0*u[k][j][i-1][m]+5.0*u[k][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_RHSX);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute eta-direction fluxes \n\t * ---------------------------------------------------------------------\n\t */\n\tif(timeron){timer_start(T_RHSY);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tvijk=vs[k][j][i];\n\t\t\t\tvp1=vs[k][j+1][i];\n\t\t\t\tvm1=vs[k][j-1][i];\n\t\t\t\trhs[k][j][i][0]=rhs[k][j][i][0]+dy1ty1* \n\t\t\t\t\t(u[k][j+1][i][0]-2.0*u[k][j][i][0]+u[k][j-1][i][0])-\n\t\t\t\t\tty2*(u[k][j+1][i][2]-u[k][j-1][i][2]);\n\t\t\t\trhs[k][j][i][1]=rhs[k][j][i][1]+dy2ty1* \n\t\t\t\t\t(u[k][j+1][i][1]-2.0*u[k][j][i][1]+u[k][j-1][i][1])+\n\t\t\t\t\tyycon2*(us[k][j+1][i]-2.0*us[k][j][i]+us[k][j-1][i])-\n\t\t\t\t\tty2*(u[k][j+1][i][1]*vp1-u[k][j-1][i][1]*vm1);\n\t\t\t\trhs[k][j][i][2]=rhs[k][j][i][2]+dy3ty1* \n\t\t\t\t\t(u[k][j+1][i][2]-2.0*u[k][j][i][2]+u[k][j-1][i][2])+\n\t\t\t\t\tyycon2*con43*(vp1-2.0*vijk+vm1)-\n\t\t\t\t\tty2*(u[k][j+1][i][2]*vp1-u[k][j-1][i][2]*vm1+\n\t\t\t\t\t\t\t(u[k][j+1][i][4]-square[k][j+1][i]- \n\t\t\t\t\t\t\t u[k][j-1][i][4]+square[k][j-1][i])* c2);\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]+dy4ty1* \n\t\t\t\t\t(u[k][j+1][i][3]-2.0*u[k][j][i][3]+u[k][j-1][i][3])+\n\t\t\t\t\tyycon2*(ws[k][j+1][i]-2.0*ws[k][j][i]+ws[k][j-1][i])-\n\t\t\t\t\tty2*(u[k][j+1][i][3]*vp1-u[k][j-1][i][3]*vm1);\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]+dy5ty1* \n\t\t\t\t\t(u[k][j+1][i][4]-2.0*u[k][j][i][4]+u[k][j-1][i][4])+\n\t\t\t\t\tyycon3*(qs[k][j+1][i]-2.0*qs[k][j][i]+qs[k][j-1][i])+\n\t\t\t\t\tyycon4*(vp1*vp1- 2.0*vijk*vijk + vm1*vm1)+\n\t\t\t\t\tyycon5*(u[k][j+1][i][4]*rho_i[k][j+1][i]- \n\t\t\t\t\t\t\t2.0*u[k][j][i][4]*rho_i[k][j][i]+\n\t\t\t\t\t\t\tu[k][j-1][i][4]*rho_i[k][j-1][i])-\n\t\t\t\t\tty2*((c1*u[k][j+1][i][4]-c2*square[k][j+1][i])*vp1 -\n\t\t\t\t\t\t\t(c1*u[k][j-1][i][4]-c2*square[k][j-1][i])*vm1);\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * add fourth order eta-direction dissipation         \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tj = 1;\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(5.0*u[k][j][i][m]-4.0*u[k][j+1][i][m]+u[k][j+2][i][m]);\n\t\t\t}\n\t\t}\n\t\tj = 2;\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m = 0; m < 5; m++) {\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(-4.0*u[k][j-1][i][m]+6.0*u[k][j][i][m]-\n\t\t\t\t\t 4.0*u[k][j+1][i][m]+u[k][j+2][i][m]);\n\t\t\t}\n\t\t}\n\t\tfor (j=3; j<=ny2-2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t\t(u[k][j-2][i][m]-4.0*u[k][j-1][i][m]+ \n\t\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k][j+1][i][m]+ \n\t\t\t\t\t\t u[k][j+2][i][m]);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tj=ny2-1;\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k][j-2][i][m]-4.0*u[k][j-1][i][m]+ \n\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k][j+1][i][m]);\n\t\t\t}\n\t\t}\n\t\tj=ny2;\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k][j-2][i][m]-4.0*u[k][j-1][i][m]+5.0*u[k][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_RHSY);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute zeta-direction fluxes \n\t * ---------------------------------------------------------------------\n\t */\n\tif(timeron){timer_start(T_RHSZ);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\twijk=ws[k][j][i];\n\t\t\t\twp1=ws[k+1][j][i];\n\t\t\t\twm1=ws[k-1][j][i];\n\t\t\t\trhs[k][j][i][0]=rhs[k][j][i][0]+dz1tz1* \n\t\t\t\t\t(u[k+1][j][i][0]-2.0*u[k][j][i][0]+u[k-1][j][i][0])-\n\t\t\t\t\ttz2*(u[k+1][j][i][3]-u[k-1][j][i][3]);\n\t\t\t\trhs[k][j][i][1]=rhs[k][j][i][1]+dz2tz1* \n\t\t\t\t\t(u[k+1][j][i][1]-2.0*u[k][j][i][1]+u[k-1][j][i][1])+\n\t\t\t\t\tzzcon2*(us[k+1][j][i]-2.0*us[k][j][i]+us[k-1][j][i])-\n\t\t\t\t\ttz2*(u[k+1][j][i][1]*wp1-u[k-1][j][i][1]*wm1);\n\t\t\t\trhs[k][j][i][2]=rhs[k][j][i][2]+dz3tz1* \n\t\t\t\t\t(u[k+1][j][i][2]-2.0*u[k][j][i][2]+u[k-1][j][i][2])+\n\t\t\t\t\tzzcon2*(vs[k+1][j][i]-2.0*vs[k][j][i]+vs[k-1][j][i])-\n\t\t\t\t\ttz2*(u[k+1][j][i][2]*wp1-u[k-1][j][i][2]*wm1);\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]+dz4tz1* \n\t\t\t\t\t(u[k+1][j][i][3]-2.0*u[k][j][i][3]+u[k-1][j][i][3])+\n\t\t\t\t\tzzcon2*con43*(wp1-2.0*wijk+wm1)-\n\t\t\t\t\ttz2*(u[k+1][j][i][3]*wp1-u[k-1][j][i][3]*wm1+\n\t\t\t\t\t\t\t(u[k+1][j][i][4]-square[k+1][j][i]- \n\t\t\t\t\t\t\t u[k-1][j][i][4]+square[k-1][j][i])*c2);\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]+dz5tz1* \n\t\t\t\t\t(u[k+1][j][i][4]-2.0*u[k][j][i][4]+u[k-1][j][i][4])+\n\t\t\t\t\tzzcon3*(qs[k+1][j][i]-2.0*qs[k][j][i]+qs[k-1][j][i])+\n\t\t\t\t\tzzcon4*(wp1*wp1-2.0*wijk*wijk+wm1*wm1)+\n\t\t\t\t\tzzcon5*(u[k+1][j][i][4]*rho_i[k+1][j][i]- \n\t\t\t\t\t\t\t2.0*u[k][j][i][4]*rho_i[k][j][i]+\n\t\t\t\t\t\t\tu[k-1][j][i][4]*rho_i[k-1][j][i])-\n\t\t\t\t\ttz2*((c1*u[k+1][j][i][4]-c2*square[k+1][j][i])*wp1-\n\t\t\t\t\t\t\t(c1*u[k-1][j][i][4]-c2*square[k-1][j][i])*wm1);\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * add fourth order zeta-direction dissipation                \n\t * ---------------------------------------------------------------------\n\t */\n\tk=1;\n\tfor(j=1; j<=ny2; j++){\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(5.0*u[k][j][i][m]-4.0*u[k+1][j][i][m]+u[k+2][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tk=2;\n\tfor(j=1; j<=ny2; j++){\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t(-4.0*u[k-1][j][i][m]+6.0*u[k][j][i][m]-\n\t\t\t\t\t 4.0*u[k+1][j][i][m]+u[k+2][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tfor(k=3; k<=nz2-2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp* \n\t\t\t\t\t\t(u[k-2][j][i][m]-4.0*u[k-1][j][i][m]+ \n\t\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k+1][j][i][m]+ \n\t\t\t\t\t\t u[k+2][j][i][m]);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tk=nz2-1;\n\tfor(j=1; j<=ny2; j++){\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k-2][j][i][m]-4.0*u[k-1][j][i][m]+ \n\t\t\t\t\t 6.0*u[k][j][i][m]-4.0*u[k+1][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tk=nz2;\n\tfor(j=1; j<=ny2; j++){\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-dssp*\n\t\t\t\t\t(u[k-2][j][i][m]-4.0*u[k-1][j][i][m]+5.0*u[k][j][i][m]);\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_RHSZ);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]*dt;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_RHS);}\n}\n\n/*\n * ---------------------------------------------------------------------\n * this function computes the norm of the difference between the\n * computed solution and the exact solution\n * ---------------------------------------------------------------------\n */\nvoid error_norm(double rms[]){\n\tint i, j, k, m, d;\n\tdouble xi, eta, zeta, u_exact[5], add;\n\tfor(m=0; m<5; m++){rms[m]=0.0;}\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)k*dnzm1;\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\teta=(double)j*dnym1;\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\txi=(double)i*dnxm1;\n\t\t\t\texact_solution(xi, eta, zeta, u_exact);\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tadd=u[k][j][i][m]-u_exact[m];\n\t\t\t\t\trms[m]=rms[m]+add*add;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tfor(m=0; m<5; m++){\n\t\tfor(d=0; d<3; d++){\n\t\t\trms[m]=rms[m]/(double)(grid_points[d]-2);\n\t\t}\n\t\trms[m]=sqrt(rms[m]);\n\t}\n}\n\n/*\n * ---------------------------------------------------------------------\n * compute the right hand side based on exact solution\n * ---------------------------------------------------------------------\n */\nvoid exact_rhs(){\n\tdouble dtemp[5], xi, eta, zeta, dtpp;\n\tint m, i, j, k, ip1, im1, jp1, jm1, km1, kp1;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * initialize                                  \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tfor(j=0; j<= grid_points[1]-1; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tforcing[k][j][i][m]=0.0;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * xi-direction flux differences                      \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tzeta=(double)k*dnzm1;\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\teta=(double)j*dnym1;\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\txi=(double)i*dnxm1;\n\t\t\t\texact_solution(xi, eta, zeta, dtemp);\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tue[m][i]=dtemp[m];\n\t\t\t\t}\n\t\t\t\tdtpp=1.0/dtemp[0];\n\t\t\t\tfor(m=1; m<5; m++){\n\t\t\t\t\tbuf[m][i]=dtpp*dtemp[m];\n\t\t\t\t}\n\t\t\t\tcuf[i]=buf[1][i]*buf[1][i];\n\t\t\t\tbuf[0][i]=cuf[i]+buf[2][i]*buf[2][i]+buf[3][i]*buf[3][i]; \n\t\t\t\tq[i]=0.5*(buf[1][i]*ue[1][i]+buf[2][i]*ue[2][i]+\n\t\t\t\t\t\tbuf[3][i]*ue[3][i]);\n\t\t\t}\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tim1=i-1;\n\t\t\t\tip1=i+1;\n\t\t\t\tforcing[k][j][i][0]=forcing[k][j][i][0]-\n\t\t\t\t\ttx2*(ue[1][ip1]-ue[1][im1])+\n\t\t\t\t\tdx1tx1*(ue[0][ip1]-2.0*ue[0][i]+ue[0][im1]);\n\t\t\t\tforcing[k][j][i][1]=forcing[k][j][i][1]-tx2*(\n\t\t\t\t\t\t(ue[1][ip1]*buf[1][ip1]+c2*(ue[4][ip1]-q[ip1]))-\n\t\t\t\t\t\t(ue[1][im1]*buf[1][im1]+c2*(ue[4][im1]-q[im1])))+\n\t\t\t\t\txxcon1*(buf[1][ip1]-2.0*buf[1][i]+buf[1][im1])+\n\t\t\t\t\tdx2tx1*(ue[1][ip1]-2.0*ue[1][i]+ue[1][im1]);\n\t\t\t\tforcing[k][j][i][2]=forcing[k][j][i][2]-tx2*(\n\t\t\t\t\t\tue[2][ip1]*buf[1][ip1]-ue[2][im1]*buf[1][im1])+\n\t\t\t\t\txxcon2*(buf[2][ip1]-2.0*buf[2][i]+buf[2][im1])+\n\t\t\t\t\tdx3tx1*(ue[2][ip1]-2.0*ue[2][i]+ue[2][im1]);\n\t\t\t\tforcing[k][j][i][3]=forcing[k][j][i][3]-tx2*(\n\t\t\t\t\t\tue[3][ip1]*buf[1][ip1]-ue[3][im1]*buf[1][im1])+\n\t\t\t\t\txxcon2*(buf[3][ip1]-2.0*buf[3][i]+buf[3][im1])+\n\t\t\t\t\tdx4tx1*(ue[3][ip1]-2.0*ue[3][i]+ue[3][im1]);\n\t\t\t\tforcing[k][j][i][4]=forcing[k][j][i][4]-tx2*(\n\t\t\t\t\t\tbuf[1][ip1]*(c1*ue[4][ip1]-c2*q[ip1])-\n\t\t\t\t\t\tbuf[1][im1]*(c1*ue[4][im1]-c2*q[im1]))+\n\t\t\t\t\t0.5*xxcon3*(buf[0][ip1]-2.0*buf[0][i]+buf[0][im1])+\n\t\t\t\t\txxcon4*(cuf[ip1]-2.0*cuf[i]+cuf[im1])+\n\t\t\t\t\txxcon5*(buf[4][ip1]-2.0*buf[4][i]+buf[4][im1])+\n\t\t\t\t\tdx5tx1*(ue[4][ip1]-2.0*ue[4][i]+ue[4][im1]);\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * fourth-order dissipation                         \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\ti=1;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(5.0*ue[m][i]-4.0*ue[m][i+1]+ue[m][i+2]);\n\t\t\t\ti=2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(-4.0*ue[m][i-1]+6.0*ue[m][i]-\n\t\t\t\t\t 4.0*ue[m][i+1]+ue[m][i+2]);\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tfor(i=3; i<=grid_points[0]-4; i++){\t\t\t\t\n\t\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t\t(ue[m][i-2]-4.0*ue[m][i-1]+\n\t\t\t\t\t\t 6.0*ue[m][i]-4.0*ue[m][i+1]+ue[m][i+2]);\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\ti=grid_points[0]-3;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][i-2]-4.0*ue[m][i-1]+\n\t\t\t\t\t 6.0*ue[m][i]-4.0*ue[m][i+1]);\n\t\t\t\ti=grid_points[0]-2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][i-2]-4.0*ue[m][i-1]+5.0*ue[m][i]);\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * eta-direction flux differences             \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tzeta=(double)k*dnzm1;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\txi=(double)i*dnxm1;\n\t\t\tfor(j=0;j<=grid_points[1]-1;j++){\n\t\t\t\teta=(double)j*dnym1;\n\t\t\t\texact_solution(xi, eta, zeta, dtemp);\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tue[m][j]=dtemp[m];\n\t\t\t\t}\n\t\t\t\tdtpp=1.0/dtemp[0];\n\t\t\t\tfor(m=1; m<5; m++){\n\t\t\t\t\tbuf[m][j]=dtpp*dtemp[m];\n\t\t\t\t}\n\t\t\t\tcuf[j]=buf[2][j]*buf[2][j];\n\t\t\t\tbuf[0][j]=cuf[j]+buf[1][j]*buf[1][j]+buf[3][j]*buf[3][j];\n\t\t\t\tq[j]=0.5*(buf[1][j]*ue[1][j]+buf[2][j]*ue[2][j]+\n\t\t\t\t\t\tbuf[3][j]*ue[3][j]);\n\t\t\t}\n\t\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\t\tjm1=j-1;\n\t\t\t\tjp1=j+1;\n\t\t\t\tforcing[k][j][i][0]=forcing[k][j][i][0]-\n\t\t\t\t\tty2*(ue[2][jp1]-ue[2][jm1])+\n\t\t\t\t\tdy1ty1*(ue[0][jp1]-2.0*ue[0][j]+ue[0][jm1]);\n\t\t\t\tforcing[k][j][i][1]=forcing[k][j][i][1]-ty2*(\n\t\t\t\t\t\tue[1][jp1]*buf[2][jp1]-ue[1][jm1]*buf[2][jm1])+\n\t\t\t\t\tyycon2*(buf[1][jp1]-2.0*buf[1][j]+buf[1][jm1])+\n\t\t\t\t\tdy2ty1*(ue[1][jp1]-2.0*ue[1][j]+ue[1][jm1]);\n\t\t\t\tforcing[k][j][i][2]=forcing[k][j][i][2]-ty2*(\n\t\t\t\t\t\t(ue[2][jp1]*buf[2][jp1]+c2*(ue[4][jp1]-q[jp1]))-\n\t\t\t\t\t\t(ue[2][jm1]*buf[2][jm1]+c2*(ue[4][jm1]-q[jm1])))+\n\t\t\t\t\tyycon1*(buf[2][jp1]-2.0*buf[2][j]+buf[2][jm1])+\n\t\t\t\t\tdy3ty1*(ue[2][jp1]-2.0*ue[2][j]+ue[2][jm1]);\n\t\t\t\tforcing[k][j][i][3]=forcing[k][j][i][3]-ty2*(\n\t\t\t\t\t\tue[3][jp1]*buf[2][jp1]-ue[3][jm1]*buf[2][jm1])+\n\t\t\t\t\tyycon2*(buf[3][jp1]-2.0*buf[3][j]+buf[3][jm1])+\n\t\t\t\t\tdy4ty1*(ue[3][jp1]-2.0*ue[3][j]+ue[3][jm1]);\n\t\t\t\tforcing[k][j][i][4]=forcing[k][j][i][4]-ty2*(\n\t\t\t\t\t\tbuf[2][jp1]*(c1*ue[4][jp1]-c2*q[jp1])-\n\t\t\t\t\t\tbuf[2][jm1]*(c1*ue[4][jm1]-c2*q[jm1]))+\n\t\t\t\t\t0.5*yycon3*(buf[0][jp1]-2.0*buf[0][j]+\n\t\t\t\t\t\t\tbuf[0][jm1])+\n\t\t\t\t\tyycon4*(cuf[jp1]-2.0*cuf[j]+cuf[jm1])+\n\t\t\t\t\tyycon5*(buf[4][jp1]-2.0*buf[4][j]+buf[4][jm1])+\n\t\t\t\t\tdy5ty1*(ue[4][jp1]-2.0*ue[4][j]+ue[4][jm1]);\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * fourth-order dissipation                      \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tj=1;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(5.0*ue[m][j]-4.0*ue[m][j+1]+ue[m][j+2]);\n\t\t\t\tj=2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(-4.0*ue[m][j-1]+6.0*ue[m][j]-\n\t\t\t\t\t 4.0*ue[m][j+1]+ue[m][j+2]);\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tfor(j=3; j<=grid_points[1]-4; j++){\t\t\t\t\n\t\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t\t(ue[m][j-2]-4.0*ue[m][j-1]+\n\t\t\t\t\t\t 6.0*ue[m][j]-4.0*ue[m][j+1]+ue[m][j+2]);\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tj=grid_points[1]-3;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][j-2]-4.0*ue[m][j-1]+\n\t\t\t\t\t 6.0*ue[m][j]-4.0*ue[m][j+1]);\n\t\t\t\tj=grid_points[1]-2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][j-2]-4.0*ue[m][j-1]+5.0*ue[m][j]);\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * zeta-direction flux differences                      \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\teta=(double)j*dnym1;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\txi=(double)i*dnxm1;\n\t\t\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\t\t\tzeta=(double)k*dnzm1;\n\t\t\t\texact_solution(xi, eta, zeta, dtemp);\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tue[m][k]=dtemp[m];\n\t\t\t\t}\n\t\t\t\tdtpp=1.0/dtemp[0];\n\t\t\t\tfor(m=1; m<5; m++){\n\t\t\t\t\tbuf[m][k]=dtpp*dtemp[m];\n\t\t\t\t}\n\t\t\t\tcuf[k]=buf[3][k]*buf[3][k];\n\t\t\t\tbuf[0][k]=cuf[k]+buf[1][k]*buf[1][k]+buf[2][k]*buf[2][k];\n\t\t\t\tq[k]=0.5*(buf[1][k]*ue[1][k]+buf[2][k]*ue[2][k]+\n\t\t\t\t\t\tbuf[3][k]*ue[3][k]);\n\t\t\t}\n\t\t\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\t\t\tkm1=k-1;\n\t\t\t\tkp1=k+1;\n\t\t\t\tforcing[k][j][i][0]=forcing[k][j][i][0]-\n\t\t\t\t\ttz2*(ue[3][kp1]-ue[3][km1])+\n\t\t\t\t\tdz1tz1*(ue[0][kp1]-2.0*ue[0][k]+ue[0][km1]);\n\t\t\t\tforcing[k][j][i][1]=forcing[k][j][i][1]-tz2*(\n\t\t\t\t\t\tue[1][kp1]*buf[3][kp1]-ue[1][km1]*buf[3][km1])+\n\t\t\t\t\tzzcon2*(buf[1][kp1]-2.0*buf[1][k]+buf[1][km1])+\n\t\t\t\t\tdz2tz1*(ue[1][kp1]-2.0*ue[1][k]+ue[1][km1]);\n\t\t\t\tforcing[k][j][i][2]=forcing[k][j][i][2]-tz2*(\n\t\t\t\t\t\tue[2][kp1]*buf[3][kp1]-ue[2][km1]*buf[3][km1])+\n\t\t\t\t\tzzcon2*(buf[2][kp1]-2.0*buf[2][k]+buf[2][km1])+\n\t\t\t\t\tdz3tz1*(ue[2][kp1]-2.0*ue[2][k]+ue[2][km1]);\n\t\t\t\tforcing[k][j][i][3]=forcing[k][j][i][3]-tz2*(\n\t\t\t\t\t\t(ue[3][kp1]*buf[3][kp1]+c2*(ue[4][kp1]-q[kp1]))-\n\t\t\t\t\t\t(ue[3][km1]*buf[3][km1]+c2*(ue[4][km1]-q[km1])))+\n\t\t\t\t\tzzcon1*(buf[3][kp1]-2.0*buf[3][k]+buf[3][km1])+\n\t\t\t\t\tdz4tz1*(ue[3][kp1]-2.0*ue[3][k]+ue[3][km1]);\n\t\t\t\tforcing[k][j][i][4]=forcing[k][j][i][4]-tz2*(\n\t\t\t\t\t\tbuf[3][kp1]*(c1*ue[4][kp1]-c2*q[kp1])-\n\t\t\t\t\t\tbuf[3][km1]*(c1*ue[4][km1]-c2*q[km1]))+\n\t\t\t\t\t0.5*zzcon3*(buf[0][kp1]-2.0*buf[0][k]+buf[0][km1])+\n\t\t\t\t\tzzcon4*(cuf[kp1]-2.0*cuf[k]+cuf[km1])+\n\t\t\t\t\tzzcon5*(buf[4][kp1]-2.0*buf[4][k]+buf[4][km1])+\n\t\t\t\t\tdz5tz1*(ue[4][kp1]-2.0*ue[4][k]+ue[4][km1]);\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * fourth-order dissipation\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tk=1;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(5.0*ue[m][k]-4.0*ue[m][k+1]+ue[m][k+2]);\n\t\t\t\tk=2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(-4.0*ue[m][k-1]+6.0*ue[m][k]-\n\t\t\t\t\t 4.0*ue[m][k+1]+ue[m][k+2]);\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tfor(k=3; k<=grid_points[2]-4; k++){\t\t\t\t\n\t\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t\t(ue[m][k-2]-4.0*ue[m][k-1]+\n\t\t\t\t\t\t 6.0*ue[m][k]-4.0*ue[m][k+1]+ue[m][k+2]);\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tk=grid_points[2]-3;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][k-2]-4.0*ue[m][k-1]+\n\t\t\t\t\t 6.0*ue[m][k]-4.0*ue[m][k+1]);\n\t\t\t\tk=grid_points[2]-2;\n\t\t\t\tforcing[k][j][i][m]=forcing[k][j][i][m]-dssp*\n\t\t\t\t\t(ue[m][k-2]-4.0*ue[m][k-1]+5.0*ue[m][k]);\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * now change the sign of the forcing function\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tforcing[k][j][i][m]=-1.0*forcing[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\n/*\n * ---------------------------------------------------------------------\n * this function returns the exact solution at point xi, eta, zeta  \n * ---------------------------------------------------------------------\n */\nvoid exact_solution(double xi, double eta, double zeta, double dtemp[]){\n\tint m;\n\tfor(m=0; m<5; m++){\n\t\tdtemp[m]=ce[0][m]+xi*\n\t\t\t(ce[1][m]+xi*\n\t\t\t (ce[4][m]+xi*\n\t\t\t  (ce[7][m]+xi*\n\t\t\t   ce[10][m])))+eta*\n\t\t\t(ce[2][m]+eta*\n\t\t\t (ce[5][m]+eta*\n\t\t\t  (ce[8][m]+eta*\n\t\t\t   ce[11][m])))+zeta*\n\t\t\t(ce[3][m]+zeta*\n\t\t\t (ce[6][m]+zeta*\n\t\t\t  (ce[9][m]+zeta*\n\t\t\t   ce[12][m])));\n\t}\n}\n\n/*\n * ---------------------------------------------------------------------\n * this subroutine initializes the field variable u using \n * tri-linear transfinite interpolation of the boundary values     \n * ---------------------------------------------------------------------\n */\nvoid initialize(){\n\tint i, j, k, m, ix, iy, iz;\n\tdouble xi, eta, zeta, Pface[2][3][5], Pxi, Peta, Pzeta, temp[5];\n\t/*\n\t * ---------------------------------------------------------------------\n\t * later (in compute_rhs) we compute 1/u for every element. a few of \n\t * the corner elements are not used, but it convenient (and faster) \n\t * to compute the whole thing with a simple loop. make sure those \n\t * values are nonzero by initializing the whole thing here. \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\tu[k][j][i][0]=1.0;\n\t\t\t\tu[k][j][i][1]=0.0;\n\t\t\t\tu[k][j][i][2]=0.0;\n\t\t\t\tu[k][j][i][3]=0.0;\n\t\t\t\tu[k][j][i][4]=1.0;\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * first store the \"interpolated\" values everywhere on the grid    \n\t * ---------------------------------------------------------------------\n\t */\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)k*dnzm1;\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\teta=(double)j*dnym1;\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\txi=(double)i*dnxm1;\n\t\t\t\tfor(ix=0; ix<2; ix++){\n\t\t\t\t\tPxi=(double)ix;\n\t\t\t\t\texact_solution(Pxi, eta, zeta, &Pface[ix][0][0]);\n\t\t\t\t}\n\t\t\t\tfor(iy=0; iy<2; iy++){\n\t\t\t\t\tPeta=(double)iy;\n\t\t\t\t\texact_solution(xi, Peta, zeta, &Pface[iy][1][0]);\n\t\t\t\t}\n\t\t\t\tfor(iz=0; iz<2; iz++){\n\t\t\t\t\tPzeta=(double)iz;\n\t\t\t\t\texact_solution(xi, eta, Pzeta, &Pface[iz][2][0]);\n\t\t\t\t}\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tPxi=xi*Pface[1][0][m]+(1.0-xi)*Pface[0][0][m];\n\t\t\t\t\tPeta=eta*Pface[1][1][m]+(1.0-eta)*Pface[0][1][m];\n\t\t\t\t\tPzeta=zeta*Pface[1][2][m]+(1.0-zeta)*Pface[0][2][m];\n\t\t\t\t\tu[k][j][i][m]=Pxi+Peta+Pzeta- \n\t\t\t\t\t\tPxi*Peta-Pxi*Pzeta-Peta*Pzeta+ \n\t\t\t\t\t\tPxi*Peta*Pzeta;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * now store the exact values on the boundaries        \n\t * ---------------------------------------------------------------------\n\t * west face                                                  \n\t * ---------------------------------------------------------------------\n\t */\n\txi=0.0;\n\ti=0;\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)k*dnzm1;\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\teta=(double)j*dnym1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * east face                                                      \n\t * ---------------------------------------------------------------------\n\t */\n\txi=1.0;\n\ti=grid_points[0]-1;\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)k*dnzm1;\n\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\teta=(double)j*dnym1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * south face                                                 \n\t * ---------------------------------------------------------------------\n\t */\n\teta=0.0;\n\tj=0;\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)k*dnzm1;\n\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\txi=(double)i*dnxm1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * north face                                    \n\t * ---------------------------------------------------------------------\n\t */\n\teta=1.0;\n\tj=grid_points[1]-1;\n\tfor(k=0; k<=grid_points[2]-1; k++){\n\t\tzeta=(double)k*dnzm1;\n\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\txi=(double)i*dnxm1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * bottom face                                       \n\t * ---------------------------------------------------------------------\n\t */\n\tzeta=0.0;\n\tk=0;\n\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\teta=(double)j*dnym1;\n\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\txi=(double)i*dnxm1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * top face     \n\t * ---------------------------------------------------------------------\n\t */\n\tzeta=1.0;\n\tk=grid_points[2]-1;\n\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\teta=(double)j*dnym1;\n\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\txi=(double)i*dnxm1;\n\t\t\texact_solution(xi, eta, zeta, temp);\n\t\t\tfor(m=0; m<5; m++){\n\t\t\t\tu[k][j][i][m]=temp[m];\n\t\t\t}\n\t\t}\n\t}\n}\n\nvoid lhsinit(int ni, int nj){\n\tint j, m;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * zap the whole left hand side for starters\n\t * set all diagonal values to 1. This is overkill, but convenient\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(j=1; j<=nj; j++){\n\t\tfor(m=0; m<5; m++){\n\t\t\tlhs[j][0][m]=0.0;\n\t\t\tlhsp[j][0][m]=0.0;\n\t\t\tlhsm[j][0][m]=0.0;\n\t\t\tlhs[j][ni][m]=0.0;\n\t\t\tlhsp[j][ni][m]=0.0;\n\t\t\tlhsm[j][ni][m]=0.0;\n\t\t}\n\t\tlhs[j][0][2]=1.0;\n\t\tlhsp[j][0][2]=1.0;\n\t\tlhsm[j][0][2]=1.0;\n\t\tlhs[j][ni][2]=1.0;\n\t\tlhsp[j][ni][2]=1.0;\n\t\tlhsm[j][ni][2]=1.0;\n\t}\n}\n\nvoid lhsinitj(int nj, int ni){\n\tint i, m;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * zap the whole left hand side for starters\n\t * set all diagonal values to 1. This is overkill, but convenient\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(i=1; i<=ni; i++){\n\t\tfor(m=0; m<5; m++){\n\t\t\tlhs[0][i][m]=0.0;\n\t\t\tlhsp[0][i][m]=0.0;\n\t\t\tlhsm[0][i][m]=0.0;\n\t\t\tlhs[nj][i][m]=0.0;\n\t\t\tlhsp[nj][i][m]=0.0;\n\t\t\tlhsm[nj][i][m]=0.0;\n\t\t}\n\t\tlhs[0][i][2]=1.0;\n\t\tlhsp[0][i][2]=1.0;\n\t\tlhsm[0][i][2]=1.0;\n\t\tlhs[nj][i][2]=1.0;\n\t\tlhsp[nj][i][2]=1.0;\n\t\tlhsm[nj][i][2]=1.0;\n\t}\n}\n\n/*\n * ---------------------------------------------------------------------\n * block-diagonal matrix-vector multiplication              \n * ---------------------------------------------------------------------\n */\nvoid ninvr(){\n\tint i, j, k;\n\tdouble r1, r2, r3, r4, r5, t1, t2;\n\tif(timeron){timer_start(T_NINVR);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tr1=rhs[k][j][i][0];\n\t\t\t\tr2=rhs[k][j][i][1];\n\t\t\t\tr3=rhs[k][j][i][2];\n\t\t\t\tr4=rhs[k][j][i][3];\n\t\t\t\tr5=rhs[k][j][i][4];\n\t\t\t\tt1=bt*r3;\n\t\t\t\tt2=0.5*(r4+r5);\n\t\t\t\trhs[k][j][i][0]=-r2;\n\t\t\t\trhs[k][j][i][1]=r1;\n\t\t\t\trhs[k][j][i][2]=bt*(r4-r5);\n\t\t\t\trhs[k][j][i][3]=-t1+t2;\n\t\t\t\trhs[k][j][i][4]=t1+t2;\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_NINVR);}\n}\n\n/*\n * ---------------------------------------------------------------------\n * block-diagonal matrix-vector multiplication                       \n * ---------------------------------------------------------------------\n */\nvoid pinvr(){\n\tint i, j, k;\n\tdouble r1, r2, r3, r4, r5, t1, t2;\n\tif(timeron){timer_start(T_PINVR);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tr1=rhs[k][j][i][0];\n\t\t\t\tr2=rhs[k][j][i][1];\n\t\t\t\tr3=rhs[k][j][i][2];\n\t\t\t\tr4=rhs[k][j][i][3];\n\t\t\t\tr5=rhs[k][j][i][4];\n\t\t\t\tt1=bt*r1;\n\t\t\t\tt2=0.5*(r4+r5);\n\t\t\t\trhs[k][j][i][0]=bt*(r4-r5);\n\t\t\t\trhs[k][j][i][1]=-r3;\n\t\t\t\trhs[k][j][i][2]=r2;\n\t\t\t\trhs[k][j][i][3]=-t1+t2;\n\t\t\t\trhs[k][j][i][4]=t1+t2;\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_PINVR);}\n}\n\nvoid rhs_norm(double rms[]){\n\tint i, j, k, d, m;\n\tdouble add;\n\tfor(m=0;m<5;m++){rms[m]=0.0;}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tfor(m=0; m<5; m++){\n\t\t\t\t\tadd=rhs[k][j][i][m];\n\t\t\t\t\trms[m]=rms[m]+add*add;\n\t\t\t\t} \n\t\t\t} \n\t\t} \n\t} \n\tfor(m=0; m<5; m++){\n\t\tfor(d=0; d<3; d++){\n\t\t\trms[m]=rms[m]/(double)(grid_points[d]-2);\n\t\t}\n\t\trms[m]=sqrt(rms[m]);\n\t}\n}\n\nvoid set_constants(){\n\tce[0][0]=2.0;\n\tce[1][0]=0.0;\n\tce[2][0]=0.0;\n\tce[3][0]=4.0;\n\tce[4][0]=5.0;\n\tce[5][0]=3.0;\n\tce[6][0]=0.5;\n\tce[7][0]=0.02;\n\tce[8][0]=0.01;\n\tce[9][0]=0.03;\n\tce[10][0]=0.5;\n\tce[11][0]=0.4;\n\tce[12][0]=0.3;\n\t/* */\n\tce[0][1]=1.0;\n\tce[1][1]=0.0;\n\tce[2][1]=0.0;\n\tce[3][1]=0.0;\n\tce[4][1]=1.0;\n\tce[5][1]=2.0;\n\tce[6][1]=3.0;\n\tce[7][1]=0.01;\n\tce[8][1]=0.03;\n\tce[9][1]=0.02;\n\tce[10][1]=0.4;\n\tce[11][1]=0.3;\n\tce[12][1]=0.5;\n\t/* */\n\tce[0][2]=2.0;\n\tce[1][2]=2.0;\n\tce[2][2]=0.0;\n\tce[3][2]=0.0;\n\tce[4][2]=0.0;\n\tce[5][2]=2.0;\n\tce[6][2]=3.0;\n\tce[7][2]=0.04;\n\tce[8][2]=0.03;\n\tce[9][2]=0.05;\n\tce[10][2]=0.3;\n\tce[11][2]=0.5;\n\tce[12][2]=0.4;\n\t/* */\n\tce[0][3]=2.0;\n\tce[1][3]=2.0;\n\tce[2][3]=0.0;\n\tce[3][3]=0.0;\n\tce[4][3]=0.0;\n\tce[5][3]=2.0;\n\tce[6][3]=3.0;\n\tce[7][3]=0.03;\n\tce[8][3]=0.05;\n\tce[9][3]=0.04;\n\tce[10][3]=0.2;\n\tce[11][3]=0.1;\n\tce[12][3]=0.3;\n\t/* */\n\tce[0][4]=5.0;\n\tce[1][4]=4.0;\n\tce[2][4]=3.0;\n\tce[3][4]=2.0;\n\tce[4][4]=0.1;\n\tce[5][4]=0.4;\n\tce[6][4]=0.3;\n\tce[7][4]=0.05;\n\tce[8][4]=0.04;\n\tce[9][4]=0.03;\n\tce[10][4]=0.1;\n\tce[11][4]=0.3;\n\tce[12][4]=0.2;\n\t/* */\n\tc1=1.4;\n\tc2=0.4;\n\tc3=0.1;\n\tc4=1.0;\n\tc5=1.4;\n\t/* */\n\tbt=sqrt(0.5);\n\t/* */\n\tdnxm1=1.0/(double)(grid_points[0]-1);\n\tdnym1=1.0/(double)(grid_points[1]-1);\n\tdnzm1=1.0/(double)(grid_points[2]-1);\n\t/* */\n\tc1c2=c1*c2;\n\tc1c5=c1*c5;\n\tc3c4=c3*c4;\n\tc1345=c1c5*c3c4;\n\t/* */\n\tconz1=(1.0-c1c5);\n\t/* */\n\ttx1=1.0/(dnxm1*dnxm1);\n\ttx2=1.0/(2.0*dnxm1);\n\ttx3=1.0/dnxm1;\n\t/* */\n\tty1=1.0/(dnym1*dnym1);\n\tty2=1.0/(2.0*dnym1);\n\tty3=1.0/dnym1;\n\t/* */\n\ttz1=1.0/(dnzm1*dnzm1);\n\ttz2=1.0/(2.0*dnzm1);\n\ttz3=1.0/dnzm1;\n\t/* */\n\tdx1=0.75;\n\tdx2=0.75;\n\tdx3=0.75;\n\tdx4=0.75;\n\tdx5=0.75;\n\t/* */\n\tdy1=0.75;\n\tdy2=0.75;\n\tdy3=0.75;\n\tdy4=0.75;\n\tdy5=0.75;\n\t/* */\n\tdz1=1.0;\n\tdz2=1.0;\n\tdz3=1.0;\n\tdz4=1.0;\n\tdz5=1.0;\n\t/* */\n\tdxmax=max(dx3, dx4);\n\tdymax=max(dy2, dy4);\n\tdzmax=max(dz2, dz3);\n\t/* */\n\tdssp=0.25*max(dx1, max(dy1, dz1));\n\t/* */\n\tc4dssp=4.0*dssp;\n\tc5dssp=5.0*dssp;\n\t/* */\n\tdttx1=dt*tx1;\n\tdttx2=dt*tx2;\n\tdtty1=dt*ty1;\n\tdtty2=dt*ty2;\n\tdttz1=dt*tz1;\n\tdttz2=dt*tz2;\n\t/* */\n\tc2dttx1=2.0*dttx1;\n\tc2dtty1=2.0*dtty1;\n\tc2dttz1=2.0*dttz1;\n\t/* */\n\tdtdssp=dt*dssp;\n\t/* */\n\tcomz1=dtdssp;\n\tcomz4=4.0*dtdssp;\n\tcomz5=5.0*dtdssp;\n\tcomz6=6.0*dtdssp;\n\t/* */\n\tc3c4tx3=c3c4*tx3;\n\tc3c4ty3=c3c4*ty3;\n\tc3c4tz3=c3c4*tz3;\n\t/* */\n\tdx1tx1=dx1*tx1;\n\tdx2tx1=dx2*tx1;\n\tdx3tx1=dx3*tx1;\n\tdx4tx1=dx4*tx1;\n\tdx5tx1=dx5*tx1;\n\t/* */\n\tdy1ty1=dy1*ty1;\n\tdy2ty1=dy2*ty1;\n\tdy3ty1=dy3*ty1;\n\tdy4ty1=dy4*ty1;\n\tdy5ty1=dy5*ty1;\n\t/* */\n\tdz1tz1=dz1*tz1;\n\tdz2tz1=dz2*tz1;\n\tdz3tz1=dz3*tz1;\n\tdz4tz1=dz4*tz1;\n\tdz5tz1=dz5*tz1;\n\t/* */\n\tc2iv=2.5;\n\tcon43=4.0/3.0;\n\tcon16=1.0/6.0;\n\t/* */\n\txxcon1=c3c4tx3*con43*tx3;\n\txxcon2=c3c4tx3*tx3;\n\txxcon3=c3c4tx3*conz1*tx3;\n\txxcon4=c3c4tx3*con16*tx3;\n\txxcon5=c3c4tx3*c1c5*tx3;\n\t/* */\n\tyycon1=c3c4ty3*con43*ty3;\n\tyycon2=c3c4ty3*ty3;\n\tyycon3=c3c4ty3*conz1*ty3;\n\tyycon4=c3c4ty3*con16*ty3;\n\tyycon5=c3c4ty3*c1c5*ty3;\n\t/* */\n\tzzcon1=c3c4tz3*con43*tz3;\n\tzzcon2=c3c4tz3*tz3;\n\tzzcon3=c3c4tz3*conz1*tz3;\n\tzzcon4=c3c4tz3*con16*tz3;\n\tzzcon5=c3c4tz3*c1c5*tz3;\n}\n\n/*\n * ---------------------------------------------------------------------\n * block-diagonal matrix-vector multiplication                  \n * ---------------------------------------------------------------------\n */\nvoid txinvr(){\n\tint i, j, k;\n\tdouble t1, t2, t3, ac, ru1, uu, vv, ww, r1, r2, r3, r4, r5, ac2inv;\n\tif(timeron){timer_start(T_TXINVR);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tru1=rho_i[k][j][i];\n\t\t\t\tuu=us[k][j][i];\n\t\t\t\tvv=vs[k][j][i];\n\t\t\t\tww=ws[k][j][i];\n\t\t\t\tac=speed[k][j][i];\n\t\t\t\tac2inv=ac*ac;\n\t\t\t\tr1=rhs[k][j][i][0];\n\t\t\t\tr2=rhs[k][j][i][1];\n\t\t\t\tr3=rhs[k][j][i][2];\n\t\t\t\tr4=rhs[k][j][i][3];\n\t\t\t\tr5=rhs[k][j][i][4];\n\t\t\t\tt1=c2/ac2inv*(qs[k][j][i]*r1-uu*r2-vv*r3-ww*r4+r5);\n\t\t\t\tt2=bt*ru1*(uu*r1-r2);\n\t\t\t\tt3=(bt*ru1*ac)*t1;\n\t\t\t\trhs[k][j][i][0]=r1-t1;\n\t\t\t\trhs[k][j][i][1]=-ru1*(ww*r1-r4);\n\t\t\t\trhs[k][j][i][2]=ru1*(vv*r1-r3);\n\t\t\t\trhs[k][j][i][3]=-t2+t3;\n\t\t\t\trhs[k][j][i][4]=t2+t3;\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_TXINVR);}\n}\n\n/*\n * ---------------------------------------------------------------------\n * block-diagonal matrix-vector multiplication                       \n * ---------------------------------------------------------------------\n */\nvoid tzetar(){\n\tint i, j, k;\n\tdouble t1, t2, t3, ac, xvel, yvel, zvel, r1, r2, r3, r4, r5, btuz, ac2u, uzik1;\n\tif(timeron){timer_start(T_TZETAR);}\n\tfor(k=1; k<=nz2; k++){\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\txvel=us[k][j][i];\n\t\t\t\tyvel=vs[k][j][i];\n\t\t\t\tzvel=ws[k][j][i];\n\t\t\t\tac=speed[k][j][i];\n\t\t\t\tac2u=ac*ac;\n\t\t\t\tr1=rhs[k][j][i][0];\n\t\t\t\tr2=rhs[k][j][i][1];\n\t\t\t\tr3=rhs[k][j][i][2];\n\t\t\t\tr4=rhs[k][j][i][3];\n\t\t\t\tr5=rhs[k][j][i][4];\n\t\t\t\tuzik1=u[k][j][i][0];\n\t\t\t\tbtuz=bt*uzik1;\n\t\t\t\tt1=btuz/ac*(r4+r5);\n\t\t\t\tt2=r3+t1;\n\t\t\t\tt3=btuz*(r4-r5);\n\t\t\t\trhs[k][j][i][0]=t2;\n\t\t\t\trhs[k][j][i][1]=-uzik1*r2+xvel*t2;\n\t\t\t\trhs[k][j][i][2]=uzik1*r1+yvel*t2;\n\t\t\t\trhs[k][j][i][3]=zvel*t2+t3;\n\t\t\t\trhs[k][j][i][4]=uzik1*(-xvel*r2+yvel*r1) + \n\t\t\t\t\tqs[k][j][i]*t2+c2iv*ac2u*t1+zvel*t3;\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_TZETAR);}\n}\n\n/*\n * ---------------------------------------------------------------------\n * verification routine                         \n * ---------------------------------------------------------------------\n */\nvoid verify(int no_time_steps, char* class_npb, boolean* verified){\n\tdouble xcrref[5], xceref[5], xcrdif[5], xcedif[5], epsilon, xce[5], xcr[5], dtref;\n\tint m;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * tolerance level\n\t * ---------------------------------------------------------------------\n\t */\n\tepsilon=1.0e-08;\n\t/*\n\t * ---------------------------------------------------------------------\n\t * compute the error norm and the residual norm, and exit if not printing\n\t * ---------------------------------------------------------------------\n\t */\n\terror_norm(xce);\n\tcompute_rhs();\n\trhs_norm(xcr);\n\tfor(m=0;m<5;m++){xcr[m]=xcr[m]/dt;}\n\t*class_npb='U';\n\t*verified=TRUE;\n\tfor(m=0;m<5;m++){xcrref[m]=1.0;xceref[m]=1.0;}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * reference data for 12X12X12 grids after 100 time steps, with DT = 1.50d-02\n\t * ---------------------------------------------------------------------\n\t */\n\tif((grid_points[0]==12)&&(grid_points[1]==12)&&(grid_points[2]==12)&&(no_time_steps==100)){\n\t\t*class_npb='S';\n\t\tdtref=1.5e-2;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txcrref[0]=2.7470315451339479e-02;\n\t\txcrref[1]=1.0360746705285417e-02;\n\t\txcrref[2]=1.6235745065095532e-02;\n\t\txcrref[3]=1.5840557224455615e-02;\n\t\txcrref[4]=3.4849040609362460e-02;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txceref[0]=2.7289258557377227e-05;\n\t\txceref[1]=1.0364446640837285e-05;\n\t\txceref[2]=1.6154798287166471e-05;\n\t\txceref[3]=1.5750704994480102e-05;\n\t\txceref[4]=3.4177666183390531e-05;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 36X36X36 grids after 400 time steps, with DT = 1.5d-03\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t}else if((grid_points[0]==36)&&(grid_points[1]==36)&&(grid_points[2]==36)&&(no_time_steps==400)){\n\t\t*class_npb='W';\n\t\tdtref=1.5e-3;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txcrref[0]=0.1893253733584e-02;\n\t\txcrref[1]=0.1717075447775e-03;\n\t\txcrref[2]=0.2778153350936e-03;\n\t\txcrref[3]=0.2887475409984e-03;\n\t\txcrref[4]=0.3143611161242e-02;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txceref[0]=0.7542088599534e-04;\n\t\txceref[1]=0.6512852253086e-05;\n\t\txceref[2]=0.1049092285688e-04;\n\t\txceref[3]=0.1128838671535e-04;\n\t\txceref[4]=0.1212845639773e-03;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 64X64X64 grids after 400 time steps, with DT = 1.5d-03\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t}else if((grid_points[0]==64)&&(grid_points[1]==64)&&(grid_points[2]==64)&&(no_time_steps==400)){\n\t\t*class_npb='A';\n\t\tdtref=1.5e-3;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual.\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txcrref[0]=2.4799822399300195;\n\t\txcrref[1]=1.1276337964368832;\n\t\txcrref[2]=1.5028977888770491;\n\t\txcrref[3]=1.4217816211695179;\n\t\txcrref[4]=2.1292113035138280;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error.\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txceref[0]=1.0900140297820550e-04;\n\t\txceref[1]=3.7343951769282091e-05;\n\t\txceref[2]=5.0092785406541633e-05;\n\t\txceref[3]=4.7671093939528255e-05;\n\t\txceref[4]=1.3621613399213001e-04;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 102X102X102 grids after 400 time steps,\n\t\t * with DT = 1.0d-03\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t}else if((grid_points[0]==102)&&(grid_points[1]==102)&&(grid_points[2]==102)&&(no_time_steps==400)){\n\t\t*class_npb='B';\n\t\tdtref=1.0e-3;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txcrref[0]=0.6903293579998e+02;\n\t\txcrref[1]=0.3095134488084e+02;\n\t\txcrref[2]=0.4103336647017e+02;\n\t\txcrref[3]=0.3864769009604e+02;\n\t\txcrref[4]=0.5643482272596e+02;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txceref[0]=0.9810006190188e-02;\n\t\txceref[1]=0.1022827905670e-02;\n\t\txceref[2]=0.1720597911692e-02;\n\t\txceref[3]=0.1694479428231e-02;\n\t\txceref[4]=0.1847456263981e-01;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 162X162X162 grids after 400 time steps,\n\t\t * with DT = 0.67d-03\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t}else if((grid_points[0]==162)&&(grid_points[1]==162)&&(grid_points[2]==162)&&(no_time_steps==400)){\n\t\t*class_npb='C';\n\t\tdtref=0.67e-3;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txcrref[0]=0.5881691581829e+03;\n\t\txcrref[1]=0.2454417603569e+03;\n\t\txcrref[2]=0.3293829191851e+03;\n\t\txcrref[3]=0.3081924971891e+03;\n\t\txcrref[4]=0.4597223799176e+03;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txceref[0]=0.2598120500183e+00;\n\t\txceref[1]=0.2590888922315e-01;\n\t\txceref[2]=0.5132886416320e-01;\n\t\txceref[3]=0.4806073419454e-01;\n\t\txceref[4]=0.5483377491301e+00;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 408X408X408 grids after 500 time steps,\n\t\t * with DT = 0.3d-03\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t}else if((grid_points[0]==408)&&(grid_points[1]==408)&&(grid_points[2]==408)&&(no_time_steps==500)){\n\t\t*class_npb='D';\n\t\tdtref=0.30e-3;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txcrref[0]=0.1044696216887e+05;\n\t\txcrref[1]=0.3204427762578e+04;\n\t\txcrref[2]=0.4648680733032e+04;\n\t\txcrref[3]=0.4238923283697e+04;\n\t\txcrref[4]=0.7588412036136e+04;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txceref[0]=0.5089471423669e+01;\n\t\txceref[1]=0.5323514855894e+00;\n\t\txceref[2]=0.1187051008971e+01;\n\t\txceref[3]=0.1083734951938e+01;\n\t\txceref[4]=0.1164108338568e+02;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference data for 1020X1020X1020 grids after 500 time steps,\n\t\t * with DT = 0.1d-03\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t}else if((grid_points[0]==1020)&&(grid_points[1]==1020)&&(grid_points[2]==1020)&&(no_time_steps==500)){\n\t\t*class_npb='E';\n\t\tdtref=0.10e-3;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of residual\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txcrref[0]=0.6255387422609e+05;\n\t\txcrref[1]=0.1495317020012e+05;\n\t\txcrref[2]=0.2347595750586e+05;\n\t\txcrref[3]=0.2091099783534e+05;\n\t\txcrref[4]=0.4770412841218e+05;\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * reference values of RMS-norms of solution error\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\txceref[0]=0.6742735164909e+02;\n\t\txceref[1]=0.5390656036938e+01;\n\t\txceref[2]=0.1680647196477e+02;\n\t\txceref[3]=0.1536963126457e+02;\n\t\txceref[4]=0.1575330146156e+03;\n\t}else{\n\t\t*verified=FALSE;\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * verification test for residuals if gridsize is one of \n\t * the defined grid sizes above (class .ne. 'U')\n\t * ---------------------------------------------------------------------\n\t * compute the difference of solution values and the known reference values\n\t * ---------------------------------------------------------------------\n\t */\n\tfor(m=0; m<5; m++){\n\t\txcrdif[m]=fabs((xcr[m]-xcrref[m])/xcrref[m]);\n\t\txcedif[m]=fabs((xce[m]-xceref[m])/xceref[m]);\n\t}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * output the comparison of computed results to known cases\n\t * ---------------------------------------------------------------------\n\t */\n\tif(*class_npb!='U'){\n\t\tprintf(\" Verification being performed for class %c\\n\",*class_npb);\n\t\tprintf(\" accuracy setting for epsilon = %20.13E\\n\",epsilon);\n\t\t*verified=(fabs(dt-dtref)<=epsilon);\n\t\tif(!(*verified)){  \n\t\t\t*class_npb='U';\n\t\t\tprintf(\" DT does not match the reference value of %15.8E\\n\",dtref);\n\t\t} \n\t}else{\n\t\tprintf(\" Unknown class\\n\");\n\t}\n\tif(*class_npb!='U'){\n\t\tprintf(\" Comparison of RMS-norms of residual\\n\");\n\t}else{\n\t\tprintf(\" RMS-norms of residual\\n\");\n\t}\n\tfor(m=0;m<5;m++){\n\t\tif(*class_npb=='U'){\n\t\t\tprintf(\"          %2d%20.13E\\n\",m+1,xcr[m]);\n\t\t}else if(xcrdif[m]<=epsilon){\n\t\t\tprintf(\"          %2d%20.13E%20.13E%20.13E\\n\",m+1,xcr[m],xcrref[m],xcrdif[m]);\n\t\t}else {\n\t\t\t*verified=FALSE;\n\t\t\tprintf(\" FAILURE: %2d%20.13E%20.13E%20.13E\\n\",m+1,xcr[m],xcrref[m],xcrdif[m]);\n\t\t}\n\t}\n\tif(*class_npb!='U'){\n\t\tprintf(\" Comparison of RMS-norms of solution error\\n\");\n\t}else{\n\t\tprintf(\" RMS-norms of solution error\\n\");\n\t}\n\tfor(m=0;m<5;m++){\n\t\tif(*class_npb=='U'){\n\t\t\tprintf(\"          %2d%20.13E\\n\",m+1,xce[m]);\n\t\t}else if(xcedif[m]<=epsilon){\n\t\t\tprintf(\"          %2d%20.13E%20.13E%20.13E\\n\",m+1,xce[m],xceref[m],xcedif[m]);\n\t\t}else{\n\t\t\t*verified = FALSE;\n\t\t\tprintf(\" FAILURE: %2d%20.13E%20.13E%20.13E\\n\",m+1,xce[m],xceref[m],xcedif[m]);\n\t\t}\n\t}\n\tif(*class_npb=='U'){\n\t\tprintf(\" No reference values provided\\n\");\n\t\tprintf(\" No verification performed\\n\");\n\t}else if(*verified){\n\t\tprintf(\" Verification Successful\\n\");\n\t}else{\n\t\tprintf(\" Verification failed\\n\");\n\t}\n}\n\n/*\n * ---------------------------------------------------------------------\n * this function performs the solution of the approximate factorization\n * step in the x-direction for all five matrix components\n * simultaneously. the thomas algorithm is employed to solve the\n * systems for the x-lines. boundary conditions are non-periodic\n * ---------------------------------------------------------------------\n */\nvoid x_solve(){\n\tint i, j, k, i1, i2, m;\n\tdouble ru1, fac1, fac2;\n\tif(timeron){timer_start(T_XSOLVE);}\n\tfor(k=1; k<=nz2; k++){\n\t\tlhsinit(nx2+1, ny2);\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * computes the left hand side for the three x-factors  \n\t\t * ---------------------------------------------------------------------\n\t\t * first fill the lhs for the u-eigenvalue                   \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-1; i++){\n\t\t\t\tru1=c3c4*rho_i[k][j][i];\n\t\t\t\tcv[i]=us[k][j][i];\n\t\t\t\trhon[i]=max(max(dx2+con43*ru1,dx5+c1c5*ru1), max(dxmax+ru1,dx1));\n\t\t\t}\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tlhs[j][i][0]=0.0;\n\t\t\t\tlhs[j][i][1]=-dttx2*cv[i-1]-dttx1*rhon[i-1];\n\t\t\t\tlhs[j][i][2]=1.0+c2dttx1*rhon[i];\n\t\t\t\tlhs[j][i][3]=dttx2*cv[i+1]-dttx1*rhon[i+1];\n\t\t\t\tlhs[j][i][4]=0.0;\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * add fourth order dissipation                             \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\ti=1;\n\t\t\tlhs[j][i][2]=lhs[j][i][2]+comz5;\n\t\t\tlhs[j][i][3]=lhs[j][i][3]-comz4;\n\t\t\tlhs[j][i][4]=lhs[j][i][4]+comz1;\n\t\t\tlhs[j][i+1][1]=lhs[j][i+1][1]-comz4;\n\t\t\tlhs[j][i+1][2]=lhs[j][i+1][2]+comz6;\n\t\t\tlhs[j][i+1][3]=lhs[j][i+1][3]-comz4;\n\t\t\tlhs[j][i+1][4]=lhs[j][i+1][4]+comz1;\n\t\t}\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=3; i<=grid_points[0]-4; i++){\n\t\t\t\tlhs[j][i][0]=lhs[j][i][0]+comz1;\n\t\t\t\tlhs[j][i][1]=lhs[j][i][1]-comz4;\n\t\t\t\tlhs[j][i][2]=lhs[j][i][2]+comz6;\n\t\t\t\tlhs[j][i][3]=lhs[j][i][3]-comz4;\n\t\t\t\tlhs[j][i][4]=lhs[j][i][4]+comz1;\n\t\t\t}\n\t\t}\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\ti=grid_points[0]-3;\n\t\t\tlhs[j][i][0]=lhs[j][i][0]+comz1;\n\t\t\tlhs[j][i][1]=lhs[j][i][1]-comz4;\n\t\t\tlhs[j][i][2]=lhs[j][i][2]+comz6;\n\t\t\tlhs[j][i][3]=lhs[j][i][3]-comz4;\n\t\t\tlhs[j][i+1][0]=lhs[j][i+1][0]+comz1;\n\t\t\tlhs[j][i+1][1]=lhs[j][i+1][1]-comz4;\n\t\t\tlhs[j][i+1][2]=lhs[j][i+1][2]+comz5;\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * subsequently, fill the other factors (u+c), (u-c) by adding to \n\t\t * the first  \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tlhsp[j][i][0]=lhs[j][i][0];\n\t\t\t\tlhsp[j][i][1]=lhs[j][i][1]-dttx2*speed[k][j][i-1];\n\t\t\t\tlhsp[j][i][2]=lhs[j][i][2];\n\t\t\t\tlhsp[j][i][3]=lhs[j][i][3]+dttx2*speed[k][j][i+1];\n\t\t\t\tlhsp[j][i][4]=lhs[j][i][4];\n\t\t\t\tlhsm[j][i][0]=lhs[j][i][0];\n\t\t\t\tlhsm[j][i][1]=lhs[j][i][1]+dttx2*speed[k][j][i-1];\n\t\t\t\tlhsm[j][i][2]=lhs[j][i][2];\n\t\t\t\tlhsm[j][i][3]=lhs[j][i][3]-dttx2*speed[k][j][i+1];\n\t\t\t\tlhsm[j][i][4]=lhs[j][i][4];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * FORWARD ELIMINATION  \n\t\t * ---------------------------------------------------------------------\n\t\t * perform the thomas algorithm; first, FORWARD ELIMINATION     \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-3; i++){\n\t\t\t\ti1=i+1;\n\t\t\t\ti2=i+2;\n\t\t\t\tfac1=1.0/lhs[j][i][2];\n\t\t\t\tlhs[j][i][3]=fac1*lhs[j][i][3];\n\t\t\t\tlhs[j][i][4]=fac1*lhs[j][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t\tlhs[j][i1][2]=lhs[j][i1][2]-lhs[j][i1][1]*lhs[j][i][3];\n\t\t\t\tlhs[j][i1][3]=lhs[j][i1][3]-lhs[j][i1][1]*lhs[j][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j][i1][m]=rhs[k][j][i1][m]-lhs[j][i1][1]*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t\tlhs[j][i2][1]=lhs[j][i2][1]-lhs[j][i2][0]*lhs[j][i][3];\n\t\t\t\tlhs[j][i2][2]=lhs[j][i2][2]-lhs[j][i2][0]*lhs[j][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j][i2][m]=rhs[k][j][i2][m]-lhs[j][i2][0]*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * the last two rows in this grid block are a bit different, \n\t\t * since they do not have two more rows available for the\n\t\t * elimination of off-diagonal entries\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\ti=grid_points[0]-2;\n\t\t\ti1=grid_points[0]-1;\n\t\t\tfac1=1.0/lhs[j][i][2];\n\t\t\tlhs[j][i][3]=fac1*lhs[j][i][3];\n\t\t\tlhs[j][i][4]=fac1*lhs[j][i][4];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t}\n\t\t\tlhs[j][i1][2]=lhs[j][i1][2]-lhs[j][i1][1]*lhs[j][i][3];\n\t\t\tlhs[j][i1][3]=lhs[j][i1][3]-lhs[j][i1][1]*lhs[j][i][4];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j][i1][m]=rhs[k][j][i1][m]-lhs[j][i1][1]*rhs[k][j][i][m];\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * scale the last row immediately \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfac2 = 1.0/lhs[j][i1][2];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j][i1][m]=fac2*rhs[k][j][i1][m];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * do the u+c and the u-c factors                 \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=0; i<=grid_points[0]-3; i++){\n\t\t\t\ti1=i+1;\n\t\t\t\ti2=i+2;\n\t\t\t\tm=3;\n\t\t\t\tfac1=1.0/lhsp[j][i][2];\n\t\t\t\tlhsp[j][i][3]=fac1*lhsp[j][i][3];\n\t\t\t\tlhsp[j][i][4]=fac1*lhsp[j][i][4];\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\tlhsp[j][i1][2]=lhsp[j][i1][2]-lhsp[j][i1][1]*lhsp[j][i][3];\n\t\t\t\tlhsp[j][i1][3]=lhsp[j][i1][3]-lhsp[j][i1][1]*lhsp[j][i][4];\n\t\t\t\trhs[k][j][i1][m]=rhs[k][j][i1][m]-lhsp[j][i1][1]*rhs[k][j][i][m];\n\t\t\t\tlhsp[j][i2][1]=lhsp[j][i2][1]-lhsp[j][i2][0]*lhsp[j][i][3];\n\t\t\t\tlhsp[j][i2][2]=lhsp[j][i2][2]-lhsp[j][i2][0]*lhsp[j][i][4];\n\t\t\t\trhs[k][j][i2][m]=rhs[k][j][i2][m]-lhsp[j][i2][0]*rhs[k][j][i][m];\n\t\t\t\tm=4;\n\t\t\t\tfac1=1.0/lhsm[j][i][2];\n\t\t\t\tlhsm[j][i][3]=fac1*lhsm[j][i][3];\n\t\t\t\tlhsm[j][i][4]=fac1*lhsm[j][i][4];\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\tlhsm[j][i1][2]=lhsm[j][i1][2]-lhsm[j][i1][1]*lhsm[j][i][3];\n\t\t\t\tlhsm[j][i1][3]=lhsm[j][i1][3]-lhsm[j][i1][1]*lhsm[j][i][4];\n\t\t\t\trhs[k][j][i1][m]=rhs[k][j][i1][m]-lhsm[j][i1][1]*rhs[k][j][i][m];\n\t\t\t\tlhsm[j][i2][1]=lhsm[j][i2][1]-lhsm[j][i2][0]*lhsm[j][i][3];\n\t\t\t\tlhsm[j][i2][2]=lhsm[j][i2][2]-lhsm[j][i2][0]*lhsm[j][i][4];\n\t\t\t\trhs[k][j][i2][m]=rhs[k][j][i2][m]-lhsm[j][i2][0]*rhs[k][j][i][m];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * and again the last two rows separately\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\ti=grid_points[0]-2;\n\t\t\ti1=grid_points[0]-1;\n\t\t\tm=3;\n\t\t\tfac1=1.0/lhsp[j][i][2];\n\t\t\tlhsp[j][i][3]=fac1*lhsp[j][i][3];\n\t\t\tlhsp[j][i][4]=fac1*lhsp[j][i][4];\n\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\tlhsp[j][i1][2]=lhsp[j][i1][2]-lhsp[j][i1][1]*lhsp[j][i][3];\n\t\t\tlhsp[j][i1][3]=lhsp[j][i1][3]-lhsp[j][i1][1]*lhsp[j][i][4];\n\t\t\trhs[k][j][i1][m]=rhs[k][j][i1][m]-lhsp[j][i1][1]*rhs[k][j][i][m];\n\t\t\tm=4;\n\t\t\tfac1=1.0/lhsm[j][i][2];\n\t\t\tlhsm[j][i][3]=fac1*lhsm[j][i][3];\n\t\t\tlhsm[j][i][4]=fac1*lhsm[j][i][4];\n\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\tlhsm[j][i1][2]=lhsm[j][i1][2]-lhsm[j][i1][1]*lhsm[j][i][3];\n\t\t\tlhsm[j][i1][3]=lhsm[j][i1][3]-lhsm[j][i1][1]*lhsm[j][i][4];\n\t\t\trhs[k][j][i1][m]=rhs[k][j][i1][m]-lhsm[j][i1][1]*rhs[k][j][i][m];\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * scale the last row immediately\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\trhs[k][j][i1][3]=rhs[k][j][i1][3]/lhsp[j][i1][2];\n\t\t\trhs[k][j][i1][4]=rhs[k][j][i1][4]/lhsm[j][i1][2];\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * BACKSUBSTITUTION \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\ti=grid_points[0]-2;\n\t\t\ti1=grid_points[0]-1;\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-lhs[j][i][3]*rhs[k][j][i1][m];\n\t\t\t}\n\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]-lhsp[j][i][3]*rhs[k][j][i1][3];\n\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]-lhsm[j][i][3]*rhs[k][j][i1][4];\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * the first three factors\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=ny2; j++){\n\t\t\tfor(i=grid_points[0]-3; i>=0; i--){\n\t\t\t\ti1=i+1;\n\t\t\t\ti2=i+2;\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]- \n\t\t\t\t\t\tlhs[j][i][3]*rhs[k][j][i1][m]-\n\t\t\t\t\t\tlhs[j][i][4]*rhs[k][j][i2][m];\n\t\t\t\t}\n\t\t\t\t/*\n\t\t\t\t * ---------------------------------------------------------------------\n\t\t\t\t * and the remaining two\n\t\t\t\t * ---------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]- \n\t\t\t\t\tlhsp[j][i][3]*rhs[k][j][i1][3] -\n\t\t\t\t\tlhsp[j][i][4]*rhs[k][j][i2][3];\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]- \n\t\t\t\t\tlhsm[j][i][3]*rhs[k][j][i1][4]-\n\t\t\t\t\tlhsm[j][i][4]*rhs[k][j][i2][4];\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_XSOLVE);}\n\t/*\n\t * ---------------------------------------------------------------------\n\t * do the block-diagonal inversion          \n\t * ---------------------------------------------------------------------\n\t */\n\tninvr();\n}\n\n/*\n * ---------------------------------------------------------------------\n * this function performs the solution of the approximate factorization\n * step in the y-direction for all five matrix components\n * simultaneously. the thomas algorithm is employed to solve the\n * systems for the y-lines. boundary conditions are non-periodic\n * ---------------------------------------------------------------------\n */\nvoid y_solve(){\n\tint i, j, k, j1, j2, m;\n\tdouble ru1, fac1, fac2;\n\tif(timeron){timer_start(T_YSOLVE);}\n\tfor(k=1; k<=grid_points[2]-2; k++){\n\t\tlhsinitj(ny2+1, nx2);\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * computes the left hand side for the three y-factors   \n\t\t * ---------------------------------------------------------------------\n\t\t * first fill the lhs for the u-eigenvalue         \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(j=0; j<=grid_points[1]-1; j++){\n\t\t\t\tru1=c3c4*rho_i[k][j][i];\n\t\t\t\tcv[j]=vs[k][j][i];\n\t\t\t\trhoq[j]=max(max(dy3+con43*ru1, dy5+c1c5*ru1), max(dymax+ru1, dy1));\n\t\t\t}\n\t\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\t\tlhs[j][i][0]=0.0;\n\t\t\t\tlhs[j][i][1]=-dtty2*cv[j-1]-dtty1*rhoq[j-1];\n\t\t\t\tlhs[j][i][2]=1.0+c2dtty1*rhoq[j];\n\t\t\t\tlhs[j][i][3]=dtty2*cv[j+1]-dtty1*rhoq[j+1];\n\t\t\t\tlhs[j][i][4]=0.0;\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * add fourth order dissipation                             \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tj=1;\n\t\t\tlhs[j][i][2]=lhs[j][i][2]+comz5;\n\t\t\tlhs[j][i][3]=lhs[j][i][3]-comz4;\n\t\t\tlhs[j][i][4]=lhs[j][i][4]+comz1;\n\t\t\tlhs[j+1][i][1]=lhs[j+1][i][1]-comz4;\n\t\t\tlhs[j+1][i][2]=lhs[j+1][i][2]+comz6;\n\t\t\tlhs[j+1][i][3]=lhs[j+1][i][3]-comz4;\n\t\t\tlhs[j+1][i][4]=lhs[j+1][i][4]+comz1;\n\t\t}\n\t\tfor(j=3; j<=grid_points[1]-4; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tlhs[j][i][0]=lhs[j][i][0]+comz1;\n\t\t\t\tlhs[j][i][1]=lhs[j][i][1]-comz4;\n\t\t\t\tlhs[j][i][2]=lhs[j][i][2]+comz6;\n\t\t\t\tlhs[j][i][3]=lhs[j][i][3]-comz4;\n\t\t\t\tlhs[j][i][4]=lhs[j][i][4]+comz1;\n\t\t\t}\n\t\t}\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tj=grid_points[1]-3;\n\t\t\tlhs[j][i][0]=lhs[j][i][0]+comz1;\n\t\t\tlhs[j][i][1]=lhs[j][i][1]-comz4;\n\t\t\tlhs[j][i][2]=lhs[j][i][2]+comz6;\n\t\t\tlhs[j][i][3]=lhs[j][i][3]-comz4;\n\t\t\tlhs[j+1][i][0]=lhs[j+1][i][0]+comz1;\n\t\t\tlhs[j+1][i][1]=lhs[j+1][i][1]-comz4;\n\t\t\tlhs[j+1][i][2]=lhs[j+1][i][2]+comz5;\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * subsequently, do the other two factors                    \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=1; j<=grid_points[1]-2; j++){\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tlhsp[j][i][0]=lhs[j][i][0];\n\t\t\t\tlhsp[j][i][1]=lhs[j][i][1]-dtty2*speed[k][j-1][i];\n\t\t\t\tlhsp[j][i][2]=lhs[j][i][2];\n\t\t\t\tlhsp[j][i][3]=lhs[j][i][3]+dtty2*speed[k][j+1][i];\n\t\t\t\tlhsp[j][i][4]=lhs[j][i][4];\n\t\t\t\tlhsm[j][i][0]=lhs[j][i][0];\n\t\t\t\tlhsm[j][i][1]=lhs[j][i][1]+dtty2*speed[k][j-1][i];\n\t\t\t\tlhsm[j][i][2]=lhs[j][i][2];\n\t\t\t\tlhsm[j][i][3]=lhs[j][i][3]-dtty2*speed[k][j+1][i];\n\t\t\t\tlhsm[j][i][4]=lhs[j][i][4];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * FORWARD ELIMINATION  \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=0; j<=grid_points[1]-3; j++){\n\t\t\tj1=j+1;\n\t\t\tj2=j+2;\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfac1=1.0/lhs[j][i][2];\n\t\t\t\tlhs[j][i][3]=fac1*lhs[j][i][3];\n\t\t\t\tlhs[j][i][4]=fac1*lhs[j][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t\tlhs[j1][i][2]=lhs[j1][i][2]-lhs[j1][i][1]*lhs[j][i][3];\n\t\t\t\tlhs[j1][i][3]=lhs[j1][i][3]-lhs[j1][i][1]*lhs[j][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j1][i][m]=rhs[k][j1][i][m]-lhs[j1][i][1]*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t\tlhs[j2][i][1]=lhs[j2][i][1]-lhs[j2][i][0]*lhs[j][i][3];\n\t\t\t\tlhs[j2][i][2]=lhs[j2][i][2]-lhs[j2][i][0]*lhs[j][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j2][i][m]=rhs[k][j2][i][m]-lhs[j2][i][0]*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * the last two rows in this grid block are a bit different, \n\t\t * since they do not have two more rows available for the\n\t\t * elimination of off-diagonal entries\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tj=grid_points[1]-2;\n\t\tj1=grid_points[1]-1;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfac1=1.0/lhs[j][i][2];\n\t\t\tlhs[j][i][3]=fac1*lhs[j][i][3];\n\t\t\tlhs[j][i][4]=fac1*lhs[j][i][4];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t}\n\t\t\tlhs[j1][i][2]=lhs[j1][i][2]-lhs[j1][i][1]*lhs[j][i][3];\n\t\t\tlhs[j1][i][3]=lhs[j1][i][3]-lhs[j1][i][1]*lhs[j][i][4];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j1][i][m]=rhs[k][j1][i][m]-lhs[j1][i][1]*rhs[k][j][i][m];\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * scale the last row immediately \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfac2 = 1.0/lhs[j1][i][2];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j1][i][m]=fac2*rhs[k][j1][i][m];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * do the u+c and the u-c factors                 \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=0; j<=grid_points[1]-3; j++){\n\t\t\tj1=j+1;\n\t\t\tj2=j+2;\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tm=3;\n\t\t\t\tfac1=1.0/lhsp[j][i][2];\n\t\t\t\tlhsp[j][i][3]=fac1*lhsp[j][i][3];\n\t\t\t\tlhsp[j][i][4]=fac1*lhsp[j][i][4];\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\tlhsp[j1][i][2]=lhsp[j1][i][2]-lhsp[j1][i][1]*lhsp[j][i][3];\n\t\t\t\tlhsp[j1][i][3]=lhsp[j1][i][3]-lhsp[j1][i][1]*lhsp[j][i][4];\n\t\t\t\trhs[k][j1][i][m]=rhs[k][j1][i][m]-lhsp[j1][i][1]*rhs[k][j][i][m];\n\t\t\t\tlhsp[j2][i][1]=lhsp[j2][i][1]-lhsp[j2][i][0]*lhsp[j][i][3];\n\t\t\t\tlhsp[j2][i][2]=lhsp[j2][i][2]-lhsp[j2][i][0]*lhsp[j][i][4];\n\t\t\t\trhs[k][j2][i][m]=rhs[k][j2][i][m]-lhsp[j2][i][0]*rhs[k][j][i][m];\n\t\t\t\tm=4;\n\t\t\t\tfac1=1.0/lhsm[j][i][2];\n\t\t\t\tlhsm[j][i][3]=fac1*lhsm[j][i][3];\n\t\t\t\tlhsm[j][i][4]=fac1*lhsm[j][i][4];\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\tlhsm[j1][i][2]=lhsm[j1][i][2]-lhsm[j1][i][1]*lhsm[j][i][3];\n\t\t\t\tlhsm[j1][i][3]=lhsm[j1][i][3]-lhsm[j1][i][1]*lhsm[j][i][4];\n\t\t\t\trhs[k][j1][i][m]=rhs[k][j1][i][m]-lhsm[j1][i][1]*rhs[k][j][i][m];\n\t\t\t\tlhsm[j2][i][1]=lhsm[j2][i][1]-lhsm[j2][i][0]*lhsm[j][i][3];\n\t\t\t\tlhsm[j2][i][2]=lhsm[j2][i][2]-lhsm[j2][i][0]*lhsm[j][i][4];\n\t\t\t\trhs[k][j2][i][m]=rhs[k][j2][i][m]-lhsm[j2][i][0]*rhs[k][j][i][m];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * and again the last two rows separately\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tj=grid_points[1]-2;\n\t\tj1=grid_points[1]-1;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tm=3;\n\t\t\tfac1=1.0/lhsp[j][i][2];\n\t\t\tlhsp[j][i][3]=fac1*lhsp[j][i][3];\n\t\t\tlhsp[j][i][4]=fac1*lhsp[j][i][4];\n\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\tlhsp[j1][i][2]=lhsp[j1][i][2]-lhsp[j1][i][1]*lhsp[j][i][3];\n\t\t\tlhsp[j1][i][3]=lhsp[j1][i][3]-lhsp[j1][i][1]*lhsp[j][i][4];\n\t\t\trhs[k][j1][i][m]=rhs[k][j1][i][m]-lhsp[j1][i][1]*rhs[k][j][i][m];\n\t\t\tm=4;\n\t\t\tfac1=1.0/lhsm[j][i][2];\n\t\t\tlhsm[j][i][3]=fac1*lhsm[j][i][3];\n\t\t\tlhsm[j][i][4]=fac1*lhsm[j][i][4];\n\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\tlhsm[j1][i][2]=lhsm[j1][i][2]-lhsm[j1][i][1]*lhsm[j][i][3];\n\t\t\tlhsm[j1][i][3]=lhsm[j1][i][3]-lhsm[j1][i][1]*lhsm[j][i][4];\n\t\t\trhs[k][j1][i][m]=rhs[k][j1][i][m]-lhsm[j1][i][1]*rhs[k][j][i][m];\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * scale the last row immediately \n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\trhs[k][j1][i][3]=rhs[k][j1][i][3]/lhsp[j1][i][2];\n\t\t\trhs[k][j1][i][4]=rhs[k][j1][i][4]/lhsm[j1][i][2];\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * BACKSUBSTITUTION \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tj=grid_points[1]-2;\n\t\tj1=grid_points[1]-1;\n\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-lhs[j][i][3]*rhs[k][j1][i][m];\n\t\t\t}\n\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]-lhsp[j][i][3]*rhs[k][j1][i][3];\n\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]-lhsm[j][i][3]*rhs[k][j1][i][4];\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * the first three factors\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(j=grid_points[1]-3; j>=0; j--){\n\t\t\tj1=j+1;\n\t\t\tj2=j+2;\n\t\t\tfor(i=1; i<=grid_points[0]-2; i++){\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]- \n\t\t\t\t\t\tlhs[j][i][3]*rhs[k][j1][i][m]-\n\t\t\t\t\t\tlhs[j][i][4]*rhs[k][j2][i][m];\n\t\t\t\t}\n\t\t\t\t/*\n\t\t\t\t * ---------------------------------------------------------------------\n\t\t\t\t * and the remaining two\n\t\t\t\t * ---------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]- \n\t\t\t\t\tlhsp[j][i][3]*rhs[k][j1][i][3]-\n\t\t\t\t\tlhsp[j][i][4]*rhs[k][j2][i][3];\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]- \n\t\t\t\t\tlhsm[j][i][3]*rhs[k][j1][i][4]-\n\t\t\t\t\tlhsm[j][i][4]*rhs[k][j2][i][4];\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_YSOLVE);}\n\tpinvr();\n}\n\n/*\n * ---------------------------------------------------------------------\n * this function performs the solution of the approximate factorization\n * step in the z-direction for all five matrix components\n * simultaneously. The Thomas algorithm is employed to solve the\n * systems for the z-lines. Boundary conditions are non-periodic\n * ---------------------------------------------------------------------\n */\nvoid z_solve(){\n\tint i, j, k, k1, k2, m;\n\tdouble ru1, fac1, fac2;\n\tif(timeron){timer_start(T_ZSOLVE);}\n\tfor(j=1; j<=ny2; j++){\n\t\tlhsinitj(nz2+1, nx2);\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * computes the left hand side for the three z-factors   \n\t\t * ---------------------------------------------------------------------\n\t\t * first fill the lhs for the u-eigenvalue                          \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(k=0; k<=nz2+1; k++){\n\t\t\t\tru1=c3c4*rho_i[k][j][i];\n\t\t\t\tcv[k]=ws[k][j][i];\n\t\t\t\trhos[k]=max(max(dz4+con43*ru1, dz5+c1c5*ru1), max(dzmax+ru1, dz1));\n\t\t\t}\n\t\t\tfor(k=1; k<=nz2; k++){\n\t\t\t\tlhs[k][i][0]=0.0;\n\t\t\t\tlhs[k][i][1]=-dttz2*cv[k-1]-dttz1*rhos[k-1];\n\t\t\t\tlhs[k][i][2]=1.0+c2dttz1*rhos[k];\n\t\t\t\tlhs[k][i][3]=dttz2*cv[k+1]-dttz1*rhos[k+1];\n\t\t\t\tlhs[k][i][4]=0.0;\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * add fourth order dissipation                                  \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tk=1;\n\t\t\tlhs[k][i][2]=lhs[k][i][2]+comz5;\n\t\t\tlhs[k][i][3]=lhs[k][i][3]-comz4;\n\t\t\tlhs[k][i][4]=lhs[k][i][4]+comz1;\n\t\t\tk=2;\n\t\t\tlhs[k][i][1]=lhs[k][i][1]-comz4;\n\t\t\tlhs[k][i][2]=lhs[k][i][2]+comz6;\n\t\t\tlhs[k][i][3]=lhs[k][i][3]-comz4;\n\t\t\tlhs[k][i][4]=lhs[k][i][4]+comz1;\n\t\t}\n\t\tfor(k=3; k<=nz2-2; k++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tlhs[k][i][0]=lhs[k][i][0]+comz1;\n\t\t\t\tlhs[k][i][1]=lhs[k][i][1]-comz4;\n\t\t\t\tlhs[k][i][2]=lhs[k][i][2]+comz6;\n\t\t\t\tlhs[k][i][3]=lhs[k][i][3]-comz4;\n\t\t\t\tlhs[k][i][4]=lhs[k][i][4]+comz1;\n\t\t\t}\n\t\t}\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tk=nz2-1;\n\t\t\tlhs[k][i][0]=lhs[k][i][0]+comz1;\n\t\t\tlhs[k][i][1]=lhs[k][i][1]-comz4;\n\t\t\tlhs[k][i][2]=lhs[k][i][2]+comz6;\n\t\t\tlhs[k][i][3]=lhs[k][i][3]-comz4;\n\t\t\tk=nz2;\n\t\t\tlhs[k][i][0]=lhs[k][i][0]+comz1;\n\t\t\tlhs[k][i][1]=lhs[k][i][1]-comz4;\n\t\t\tlhs[k][i][2]=lhs[k][i][2]+comz5;\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * subsequently, fill the other factors (u+c), (u-c) \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(k=1; k<=nz2; k++){\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tlhsp[k][i][0]=lhs[k][i][0];\n\t\t\t\tlhsp[k][i][1]=lhs[k][i][1]-dttz2*speed[k-1][j][i];\n\t\t\t\tlhsp[k][i][2]=lhs[k][i][2];\n\t\t\t\tlhsp[k][i][3]=lhs[k][i][3]+dttz2*speed[k+1][j][i];\n\t\t\t\tlhsp[k][i][4]=lhs[k][i][4];\n\t\t\t\tlhsm[k][i][0]=lhs[k][i][0];\n\t\t\t\tlhsm[k][i][1]=lhs[k][i][1]+dttz2*speed[k-1][j][i];\n\t\t\t\tlhsm[k][i][2]=lhs[k][i][2];\n\t\t\t\tlhsm[k][i][3]=lhs[k][i][3]-dttz2*speed[k+1][j][i];\n\t\t\t\tlhsm[k][i][4]=lhs[k][i][4];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * FORWARD ELIMINATION  \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(k=0; k<=grid_points[2]-3; k++){\n\t\t\tk1=k+1;\n\t\t\tk2=k+2;\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tfac1=1.0/lhs[k][i][2];\n\t\t\t\tlhs[k][i][3]=fac1*lhs[k][i][3];\n\t\t\t\tlhs[k][i][4]=fac1*lhs[k][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t\tlhs[k1][i][2]=lhs[k1][i][2]-lhs[k1][i][1]*lhs[k][i][3];\n\t\t\t\tlhs[k1][i][3]=lhs[k1][i][3]-lhs[k1][i][1]*lhs[k][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k1][j][i][m]=rhs[k1][j][i][m]-lhs[k1][i][1]*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t\tlhs[k2][i][1]=lhs[k2][i][1]-lhs[k2][i][0]*lhs[k][i][3];\n\t\t\t\tlhs[k2][i][2]=lhs[k2][i][2]-lhs[k2][i][0]*lhs[k][i][4];\n\t\t\t\tfor(m=0; m<3; m++){\n\t\t\t\t\trhs[k2][j][i][m]=rhs[k2][j][i][m]-lhs[k2][i][0]*rhs[k][j][i][m];\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * the last two rows in this grid block are a bit different, \n\t\t * since they do not have two more rows available for the\n\t\t * elimination of off-diagonal entries\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tk=grid_points[2]-2;\n\t\tk1=grid_points[2]-1;\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfac1=1.0/lhs[k][i][2];\n\t\t\tlhs[k][i][3]=fac1*lhs[k][i][3];\n\t\t\tlhs[k][i][4]=fac1*lhs[k][i][4];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t}\n\t\t\tlhs[k1][i][2]=lhs[k1][i][2]-lhs[k1][i][1]*lhs[k][i][3];\n\t\t\tlhs[k1][i][3]=lhs[k1][i][3]-lhs[k1][i][1]*lhs[k][i][4];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k1][j][i][m]=rhs[k1][j][i][m]-lhs[k1][i][1]*rhs[k][j][i][m];\n\t\t\t}\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * scale the last row immediately\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\tfac2=1.0/lhs[k1][i][2];\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k1][j][i][m]=fac2*rhs[k1][j][i][m];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * do the u+c and the u-c factors               \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(k=0; k<=grid_points[2]-3; k++){\n\t\t\tk1=k+1;\n\t\t\tk2=k+2;\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tm=3;\n\t\t\t\tfac1=1.0/lhsp[k][i][2];\n\t\t\t\tlhsp[k][i][3]=fac1*lhsp[k][i][3];\n\t\t\t\tlhsp[k][i][4]=fac1*lhsp[k][i][4];\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\tlhsp[k1][i][2]=lhsp[k1][i][2]-lhsp[k1][i][1]*lhsp[k][i][3];\n\t\t\t\tlhsp[k1][i][3]=lhsp[k1][i][3]-lhsp[k1][i][1]*lhsp[k][i][4];\n\t\t\t\trhs[k1][j][i][m]=rhs[k1][j][i][m]-lhsp[k1][i][1]*rhs[k][j][i][m];\n\t\t\t\tlhsp[k2][i][1]=lhsp[k2][i][1]-lhsp[k2][i][0]*lhsp[k][i][3];\n\t\t\t\tlhsp[k2][i][2]=lhsp[k2][i][2]-lhsp[k2][i][0]*lhsp[k][i][4];\n\t\t\t\trhs[k2][j][i][m]=rhs[k2][j][i][m]-lhsp[k2][i][0]*rhs[k][j][i][m];\n\t\t\t\tm=4;\n\t\t\t\tfac1=1.0/lhsm[k][i][2];\n\t\t\t\tlhsm[k][i][3]=fac1*lhsm[k][i][3];\n\t\t\t\tlhsm[k][i][4]=fac1*lhsm[k][i][4];\n\t\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\t\tlhsm[k1][i][2]=lhsm[k1][i][2]-lhsm[k1][i][1]*lhsm[k][i][3];\n\t\t\t\tlhsm[k1][i][3]=lhsm[k1][i][3]-lhsm[k1][i][1]*lhsm[k][i][4];\n\t\t\t\trhs[k1][j][i][m]=rhs[k1][j][i][m]-lhsm[k1][i][1]*rhs[k][j][i][m];\n\t\t\t\tlhsm[k2][i][1]=lhsm[k2][i][1]-lhsm[k2][i][0]*lhsm[k][i][3];\n\t\t\t\tlhsm[k2][i][2]=lhsm[k2][i][2]-lhsm[k2][i][0]*lhsm[k][i][4];\n\t\t\t\trhs[k2][j][i][m]=rhs[k2][j][i][m]-lhsm[k2][i][0]*rhs[k][j][i][m];\n\t\t\t}\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * and again the last two rows separately\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tk=grid_points[2]-2;\n\t\tk1=grid_points[2]-1;\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tm=3;\n\t\t\tfac1=1.0/lhsp[k][i][2];\n\t\t\tlhsp[k][i][3]=fac1*lhsp[k][i][3];\n\t\t\tlhsp[k][i][4]=fac1*lhsp[k][i][4];\n\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\tlhsp[k1][i][2]=lhsp[k1][i][2]-lhsp[k1][i][1]*lhsp[k][i][3];\n\t\t\tlhsp[k1][i][3]=lhsp[k1][i][3]-lhsp[k1][i][1]*lhsp[k][i][4];\n\t\t\trhs[k1][j][i][m]=rhs[k1][j][i][m]-lhsp[k1][i][1]*rhs[k][j][i][m];\n\t\t\tm=4;\n\t\t\tfac1=1.0/lhsm[k][i][2];\n\t\t\tlhsm[k][i][3]=fac1*lhsm[k][i][3];\n\t\t\tlhsm[k][i][4]=fac1*lhsm[k][i][4];\n\t\t\trhs[k][j][i][m]=fac1*rhs[k][j][i][m];\n\t\t\tlhsm[k1][i][2]=lhsm[k1][i][2]-lhsm[k1][i][1]*lhsm[k][i][3];\n\t\t\tlhsm[k1][i][3]=lhsm[k1][i][3]-lhsm[k1][i][1]*lhsm[k][i][4];\n\t\t\trhs[k1][j][i][m]=rhs[k1][j][i][m]-lhsm[k1][i][1]*rhs[k][j][i][m];\n\t\t\t/*\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t * scale the last row immediately (some of this is overkill\n\t\t\t * if this is the last cell)\n\t\t\t * ---------------------------------------------------------------------\n\t\t\t */\n\t\t\trhs[k1][j][i][3]=rhs[k1][j][i][3]/lhsp[k1][i][2];\n\t\t\trhs[k1][j][i][4]=rhs[k1][j][i][4]/lhsm[k1][i][2];\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * BACKSUBSTITUTION \n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tk=grid_points[2]-2;\n\t\tk1=grid_points[2]-1;\n\t\tfor(i=1; i<=nx2; i++){\n\t\t\tfor(m=0; m<3; m++){\n\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]-lhs[k][i][3]*rhs[k1][j][i][m];\n\t\t\t}\n\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]-lhsp[k][i][3]*rhs[k1][j][i][3];\n\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]-lhsm[k][i][3]*rhs[k1][j][i][4];\n\t\t}\n\t\t/*\n\t\t * ---------------------------------------------------------------------\n\t\t * whether or not this is the last processor, we always have\n\t\t * to complete the back-substitution \n\t\t * ---------------------------------------------------------------------\n\t\t * the first three factors\n\t\t * ---------------------------------------------------------------------\n\t\t */\n\t\tfor(k=grid_points[2]-3; k>=0; k--){\n\t\t\tk1=k+1;\n\t\t\tk2=k+2;\n\t\t\tfor(i=1; i<=nx2; i++){\n\t\t\t\tfor (m = 0; m < 3; m++) {\n\t\t\t\t\trhs[k][j][i][m]=rhs[k][j][i][m]- \n\t\t\t\t\t\tlhs[k][i][3]*rhs[k1][j][i][m]-\n\t\t\t\t\t\tlhs[k][i][4]*rhs[k2][j][i][m];\n\t\t\t\t}\n\t\t\t\t/*\n\t\t\t\t * ---------------------------------------------------------------------\n\t\t\t\t * and the remaining two\n\t\t\t\t * ---------------------------------------------------------------------\n\t\t\t\t */\n\t\t\t\trhs[k][j][i][3]=rhs[k][j][i][3]- \n\t\t\t\t\tlhsp[k][i][3]*rhs[k1][j][i][3]-\n\t\t\t\t\tlhsp[k][i][4]*rhs[k2][j][i][3];\n\t\t\t\trhs[k][j][i][4]=rhs[k][j][i][4]- \n\t\t\t\t\tlhsm[k][i][3]*rhs[k1][j][i][4]-\n\t\t\t\t\tlhsm[k][i][4]*rhs[k2][j][i][4];\n\t\t\t}\n\t\t}\n\t}\n\tif(timeron){timer_stop(T_ZSOLVE);}\n\ttzetar();\n}", "label": 1}
{"code": "\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <errno.h>\n#include <ctype.h>\n#include <math.h>\n\n#ifdef _OPENMP\n#include <omp.h>\n#endif\n\n\n#include \"adc.h\"\n#include \"macrodef.h\"\n#include \"npbparams.h\"\n\n#ifdef UNIX\n#include <sys/types.h>\n#include <unistd.h>\n\n#define MAX_TIMERS 64  /* NPB maximum timers */\n  void    timer_clear(int);\n  void    timer_start(int);\n  void    timer_stop(int); \n  double  timer_read(int);\n#endif\n\nvoid c_print_results( char   *name,\n                      char   clss,\n                      int    n1, \n                      int    n2,\n                      int    n3,\n                      int    niter,\n                      double t,\n                      double mops,\n\t\t      char   *optype,\n                      int    passed_verification,\n                      char   *npbversion,\n                      char   *compiletime,\n                      char   *cc,\n                      char   *clink,\n                      char   *c_lib,\n                      char   *c_inc,\n                      char   *cflags,\n                      char   *clinkflags );\n\nvoid initADCpar(ADC_PAR *par);\nint ParseParFile(char* parfname, ADC_PAR *par); \nint GenerateADC(ADC_PAR *par);\nvoid ShowADCPar(ADC_PAR *par);\nint32 DC(ADC_VIEW_PARS *adcpp);\nint Verify(long long int checksum,ADC_VIEW_PARS *adcpp);\n\n#define BlockSize 1024\n\nint main ( int argc, char * argv[] ) \n{\n  ADC_PAR *parp;\n  ADC_VIEW_PARS *adcpp;\n  int32 retCode;\n\n  fprintf(stdout,\"\\n\\n NAS Parallel Benchmarks (NPB3.3-OMP) - DC Benchmark\\n\\n\" );\n  if(argc!=3){\n    fprintf(stdout,\" No Paramter file. Using compiled defaults\\n\");\n  }\n  if(argc>3 || (argc>1 && !isdigit(argv[1][0]))){\n    fprintf(stderr,\"Usage: <program name> <amount of memory>\\n\");\n    fprintf(stderr,\"       <file of parameters>\\n\");\n    fprintf(stderr,\"Example: bin/dc.S 1000000 DC/ADC.par\\n\");\n    fprintf(stderr,\"The last argument, (a parameter file) can be skipped\\n\");\n    exit(1);\n  }\n\n  if(  !(parp = (ADC_PAR*) malloc(sizeof(ADC_PAR)))\n     ||!(adcpp = (ADC_VIEW_PARS*) malloc(sizeof(ADC_VIEW_PARS)))){\n     PutErrMsg(\"main: malloc failed\")\n     exit(1);\n  }\n  initADCpar(parp);\n  parp->clss=CLASS;\n  if(argc!=3){\n    parp->dim=attrnum;\n    parp->tuplenum=input_tuples;    \n  }else if( (argc==3)&&(!ParseParFile(argv[2], parp))) {\n    PutErrMsg(\"main.ParseParFile failed\")\n    exit(1);\n  }\n  ShowADCPar(parp); \n  if(!GenerateADC(parp)) {\n     PutErrMsg(\"main.GenerateAdc failed\")\n     exit(1);\n  }\n\n  adcpp->ndid = parp->ndid;  \n  adcpp->clss = parp->clss;\n  adcpp->nd = parp->dim;\n  adcpp->nm = parp->mnum;\n  adcpp->nTasks = 1;\n\n  if(argc>=2)\n    adcpp->memoryLimit = atoi(argv[1]);\n  else\n    adcpp->memoryLimit = 0;\n  if(adcpp->memoryLimit <= 0){\n    /* size of rb-tree with tuplenum nodes */\n    adcpp->memoryLimit = parp->tuplenum*(50+5*parp->dim); \n    fprintf(stdout,\"Estimated rb-tree size = %d \\n\", adcpp->memoryLimit);\n  }\n  adcpp->nInputRecs = parp->tuplenum;\n  strcpy(adcpp->adcName, parp->filename);\n  strcpy(adcpp->adcInpFileName, parp->filename);\n\n  if((retCode=DC(adcpp))) {\n     PutErrMsg(\"main.DC failed\")\n     fprintf(stderr, \"main.ParRun failed: retcode = %d\\n\", retCode);\n     exit(1);\n  }\n\n  if(parp)  { free(parp);   parp = 0; }\n  if(adcpp) { free(adcpp); adcpp = 0; }\n  return 0;\n}\n\nint32\t\t CloseAdcView(ADC_VIEW_CNTL *adccntl);  \nint32\t\t PartitionCube(ADC_VIEW_CNTL *avp);\t\t\t\t\nADC_VIEW_CNTL *NewAdcViewCntl(ADC_VIEW_PARS *adcpp, uint32 pnum);\nint32\t\t ComputeGivenGroupbys(ADC_VIEW_CNTL *adccntl);\n\nint32 DC(ADC_VIEW_PARS *adcpp) {\n   int32 itsk=0;\n   double t_total=0.0;\n   int verified;\n\n   typedef struct { \n      int    verificationFailed;\n      uint32 totalViewTuples;\n      uint64 totalViewSizesInBytes;\n      uint32 totalNumberOfMadeViews;\n      uint64 checksum;\n      double tm_max;\n   } PAR_VIEW_ST;\n   \n   PAR_VIEW_ST *pvstp;\n\n   pvstp = (PAR_VIEW_ST*) malloc(sizeof(PAR_VIEW_ST));\n   pvstp->verificationFailed = 0;\n   pvstp->totalViewTuples = 0;\n   pvstp->totalViewSizesInBytes = 0;\n   pvstp->totalNumberOfMadeViews = 0;\n   pvstp->checksum = 0;\n\n#ifdef _OPENMP    \n   adcpp->nTasks=omp_get_max_threads();\n   fprintf(stdout,\"\\nNumber of available threads:  %d\\n\", adcpp->nTasks);\n   if (adcpp->nTasks > MAX_NUMBER_OF_TASKS) {\n      adcpp->nTasks = MAX_NUMBER_OF_TASKS;\n      fprintf(stdout,\"Warning: Maximum number of tasks reached: %d\\n\",\n              adcpp->nTasks);\n   }\n#pragma omp parallel shared(pvstp) private(itsk)\n#endif\n  {\n   double tm0=0;\n   int itimer=0;\n   ADC_VIEW_CNTL *adccntlp;\n#ifdef _OPENMP\n   itsk=omp_get_thread_num();\n#endif\n   adccntlp = NewAdcViewCntl(adcpp, itsk);\n\n   if (!adccntlp) { \n      PutErrMsg(\"ParRun.NewAdcViewCntl: returned NULL\")\n      adccntlp->verificationFailed=1;\n   }else{\n     adccntlp->verificationFailed = 0;\n     if (adccntlp->retCode!=0) {\n   \tfprintf(stderr, \n   \t\t \"DC.NewAdcViewCntl: return code = %d\\n\",\n   \t\t\t\t\t\tadccntlp->retCode); \n     }\n   }\n\n   if (!adccntlp->verificationFailed) {\n     if( PartitionCube(adccntlp) ) {\n        PutErrMsg(\"DC.PartitionCube failed\");\n     }\n     timer_clear(itimer);\n     timer_start(itimer);\n     if( ComputeGivenGroupbys(adccntlp) ) {\n        PutErrMsg(\"DC.ComputeGivenGroupbys failed\");\n     }\n     timer_stop(itimer);\n     tm0 = timer_read(itimer);\n   }\n#ifdef _OPENMP    \n#pragma omp critical\n#endif\n   {\n     if(pvstp->tm_max<tm0) pvstp->tm_max=tm0;\n     pvstp->verificationFailed += adccntlp->verificationFailed;\n     if (!adccntlp->verificationFailed) {\n       pvstp->totalNumberOfMadeViews += adccntlp->numberOfMadeViews;\n       pvstp->totalViewSizesInBytes += adccntlp->totalViewFileSize;\n       pvstp->totalViewTuples += adccntlp->totalOfViewRows;\n       pvstp->checksum += adccntlp->totchs[0];\n     }   \n   }\n   if(CloseAdcView(adccntlp)) {\n     PutErrMsg(\"ParRun.CloseAdcView: is failed\");\n     adccntlp->verificationFailed = 1;\n   }\n } /* omp parallel */\n\n   t_total=pvstp->tm_max; \n \n   pvstp->verificationFailed=Verify(pvstp->checksum,adcpp);\n   verified = (pvstp->verificationFailed == -1)? -1 :\n              (pvstp->verificationFailed ==  0)?  1 : 0;\n\n   fprintf(stdout,\"\\n*** DC Benchmark Results:\\n\");\n   fprintf(stdout,\" Benchmark Time   = %20.3f\\n\", t_total);\n   fprintf(stdout,\" Input Tuples     =         %12d\\n\", (int) adcpp->nInputRecs);\n   fprintf(stdout,\" Number of Views  =         %12d\\n\",\n           (int) pvstp->totalNumberOfMadeViews);\n   fprintf(stdout,\" Number of Tasks  =         %12d\\n\", (int) adcpp->nTasks);\n   fprintf(stdout,\" Tuples Generated = %20.0f\\n\",\n           (double) pvstp->totalViewTuples);\n   fprintf(stdout,\" Tuples/s         = %20.2f\\n\", \n           (double) pvstp->totalViewTuples / t_total);\n   fprintf(stdout,\" Checksum         = %20.12e\\n\", (double) pvstp->checksum);\n   if (pvstp->verificationFailed)\n      fprintf(stdout, \" Verification failed\\n\");\n\n   c_print_results(\"DC\",\n  \t\t   adcpp->clss,\n  \t\t   (int)adcpp->nInputRecs,\n                   0,\n                   0,\n                   1,\n  \t\t   t_total,\n  \t\t   (double) pvstp->totalViewTuples * 1.e-6 / t_total, \n  \t\t   \"Tuples generated\", \n  \t\t   verified,\n  \t\t   NPBVERSION,\n  \t\t   COMPILETIME,\n  \t\t   CC,\n  \t\t   CLINK,\n  \t\t   C_LIB,\n  \t\t   C_INC,\n  \t\t   CFLAGS,\n  \t\t   CLINKFLAGS); \n   return ADC_OK;\n}\n\nlong long checksumS=464620213;\nlong long checksumWlo=434318;\nlong long checksumWhi=1401796;\nlong long checksumAlo=178042;\nlong long checksumAhi=7141688;\nlong long checksumBlo=700453;\nlong long checksumBhi=9348365;\n\nint Verify(long long int checksum,ADC_VIEW_PARS *adcpp){\n  switch(adcpp->clss){\n    case 'S':\n      if(checksum==checksumS) return 0;\n      break;\n    case 'W':\n      if(checksum==checksumWlo+1000000*checksumWhi) return 0;\n      break;\n    case 'A':\n      if(checksum==checksumAlo+1000000*checksumAhi) return 0;\n      break;\n    case 'B':\n      if(checksum==checksumBlo+1000000*checksumBhi) return 0;\n      break;\n    default:\n      return -1; /* CLASS U */\n  }\n  return 1;\n}\n\n", "label": 1}
{"code": "/*\n * =====================================================================================\n *\n *       Filename:  suite.c\n *\n *    Description:  The main wrapper for the suite\n *\n *        Version:  1.0\n *        Created:  10/22/2009 08:40:34 PM\n *       Revision:  none\n *       Compiler:  gcc\n *\n *         Author:  Liang Wang (lw2aw), lw2aw@virginia.edu\n *        Company:  CS@UVa\n *\n * =====================================================================================\n */\n\n#include <stdio.h>\n#include <unistd.h>\n#include <getopt.h>\n#include <stdlib.h>\n#include <assert.h>\n\n#include \"common.h\"\n\nstatic int do_verify = 0;\nint omp_num_threads = 1;\n\nstatic struct option long_options[] = {\n    /* name, has_arg, flag, val */\n    {\"input\", 1, NULL, 'i'},\n    {\"size\", 1, NULL, 's'},\n    {\"verify\", 0, NULL, 'v'},\n    {0, 0, 0, 0}};\n\nextern void lud_omp(float *m, int matrix_dim);\n\nint main(int argc, char *argv[]) {\n    int matrix_dim = 32; /* default size */\n    int opt, option_index = 0;\n    func_ret_t ret;\n    const char *input_file = NULL;\n    float *m, *mm;\n    stopwatch sw;\n\n\n    while ((opt = getopt_long(argc, argv, \"::vs:i:\", long_options,\n                              &option_index)) != -1) {\n        switch (opt) {\n        case 'i':\n            input_file = optarg;\n            break;\n        case 'v':\n            do_verify = 1;\n            break;\n        case 's':\n            matrix_dim = atoi(optarg);\n            printf(\"Generate input matrix internally, size =%d\\n\", matrix_dim);\n            // fprintf(stderr, \"Currently not supported, use -i instead\\n\");\n            // fprintf(stderr, \"Usage: %s [-v] [-s matrix_size|-i\n            // input_file]\\n\", argv[0]);\n            // exit(EXIT_FAILURE);\n            break;\n        case '?':\n            fprintf(stderr, \"invalid option\\n\");\n            break;\n        case ':':\n            fprintf(stderr, \"missing argument\\n\");\n            break;\n        default:\n            fprintf(stderr, \"Usage: %s [-v] [-s matrix_size|-i input_file]\\n\",\n                    argv[0]);\n            exit(EXIT_FAILURE);\n        }\n    }\n\n    if ((optind < argc) || (optind == 1)) {\n        fprintf(stderr, \"Usage: %s [-v] [-n no. of threads] [-s matrix_size|-i \"\n                        \"input_file]\\n\",\n                argv[0]);\n        exit(EXIT_FAILURE);\n    }\n\n    if (input_file) {\n        printf(\"Reading matrix from file %s\\n\", input_file);\n        ret = create_matrix_from_file(&m, input_file, &matrix_dim);\n        if (ret != RET_SUCCESS) {\n            m = NULL;\n            fprintf(stderr, \"error create matrix from file %s\\n\", input_file);\n            exit(EXIT_FAILURE);\n        }\n    } else if (matrix_dim) {\n        printf(\"Creating matrix internally size=%d\\n\", matrix_dim);\n        ret = create_matrix(&m, matrix_dim);\n        if (ret != RET_SUCCESS) {\n            m = NULL;\n            fprintf(stderr, \"error create matrix internally size=%d\\n\",\n                    matrix_dim);\n            exit(EXIT_FAILURE);\n        }\n    }\n\n    else {\n        printf(\"No input file specified!\\n\");\n        exit(EXIT_FAILURE);\n    }\n\n    if (do_verify) {\n        printf(\"Before LUD\\n\");\n        /* print_matrix(m, matrix_dim); */\n        matrix_duplicate(m, &mm, matrix_dim);\n    }\n\n\n    stopwatch_start(&sw);\n    lud_omp(m, matrix_dim);\n    stopwatch_stop(&sw);\n    printf(\"Time consumed(ms): %lf\\n\", 1000 * get_interval_by_sec(&sw));\n\n    if (do_verify) {\n        printf(\"After LUD\\n\");\n        /* print_matrix(m, matrix_dim); */\n        printf(\">>>Verify<<<<\\n\");\n        lud_verify(mm, m, matrix_dim);\n        free(mm);\n    }\n\n    free(m);\n\n    return EXIT_SUCCESS;\n} /* ----------  end of function main  ---------- */\n", "label": 2}
{"code": "\n\n#include <stdio.h>\n#include <unistd.h>\n#include <stdlib.h>\n#include <string.h>\n#include <limits.h>\n#include <math.h>\n#include <sys/types.h>\n#include <fcntl.h>\n//#include <omp.h>\n#include \"getopt.h\"\n\n#include \"kmeans.h\"\n\nextern double wtime(void);\n\n/*---< usage() >------------------------------------------------------------*/\nvoid usage(char *argv0) {\n    char *help =\n        \"Usage: %s [switches] -i filename\\n\"\n        \"       -i filename     \t\t: file containing data to be \"\n        \"clustered\\n\"\n        \"       -b                 \t: input file is in binary format\\n\"\n        \"       -k                 \t: number of clusters (default is 5) \\n\"\n        \"       -t threshold\t\t: threshold value\\n\"\n        \"       -n no. of threads\t: number of threads\";\n    fprintf(stderr, help, argv0);\n    exit(-1);\n}\n\n/*---< main() >-------------------------------------------------------------*/\nint main(int argc, char **argv) {\n    int opt;\n    extern char *optarg;\n    extern int optind;\n    int nclusters = 5;\n    char *filename = 0;\n    float *buf;\n    float **attributes;\n    float **cluster_centres = NULL;\n    int i, j;\n\n    int numAttributes;\n    int numObjects;\n    char line[1024];\n    int isBinaryFile = 0;\n    int nloops = 1;\n    float threshold = 0.001;\n    double timing;\n\n    while ((opt = getopt(argc, argv, \"i:k:t:b:n:?\")) != EOF) {\n        switch (opt) {\n        case 'i':\n            filename = optarg;\n            break;\n        case 'b':\n            isBinaryFile = 1;\n            break;\n        case 't':\n            threshold = atof(optarg);\n            break;\n        case 'k':\n            nclusters = atoi(optarg);\n            break;\n        case '?':\n            usage(argv[0]);\n            break;\n        default:\n            usage(argv[0]);\n            break;\n        }\n    }\n\n\n    if (filename == 0)\n        usage(argv[0]);\n\n    numAttributes = numObjects = 0;\n\n    /* from the input file, get the numAttributes and numObjects ------------*/\n\n    if (isBinaryFile) {\n        int infile;\n        if ((infile = open(filename, O_RDONLY, \"0600\")) == -1) {\n            fprintf(stderr, \"Error: no such file (%s)\\n\", filename);\n            exit(1);\n        }\n        read(infile, &numObjects, sizeof(int));\n        read(infile, &numAttributes, sizeof(int));\n\n\n        /* allocate space for attributes[] and read attributes of all objects */\n        buf = (float *)malloc(numObjects * numAttributes * sizeof(float));\n        attributes = (float **)malloc(numObjects * sizeof(float *));\n        attributes[0] =\n            (float *)malloc(numObjects * numAttributes * sizeof(float));\n        for (i = 1; i < numObjects; i++)\n            attributes[i] = attributes[i - 1] + numAttributes;\n\n        read(infile, buf, numObjects * numAttributes * sizeof(float));\n\n        close(infile);\n    } else {\n        FILE *infile;\n        if ((infile = fopen(filename, \"r\")) == NULL) {\n            fprintf(stderr, \"Error: no such file (%s)\\n\", filename);\n            exit(1);\n        }\n        while (fgets(line, 1024, infile) != NULL)\n            if (strtok(line, \" \\t\\n\") != 0)\n                numObjects++;\n        rewind(infile);\n        while (fgets(line, 1024, infile) != NULL) {\n            if (strtok(line, \" \\t\\n\") != 0) {\n                /* ignore the id (first attribute): numAttributes = 1; */\n                while (strtok(NULL, \" ,\\t\\n\") != NULL)\n                    numAttributes++;\n                break;\n            }\n        }\n\n\n        /* allocate space for attributes[] and read attributes of all objects */\n        buf = (float *)malloc(numObjects * numAttributes * sizeof(float));\n        attributes = (float **)malloc(numObjects * sizeof(float *));\n        attributes[0] =\n            (float *)malloc(numObjects * numAttributes * sizeof(float));\n        for (i = 1; i < numObjects; i++)\n            attributes[i] = attributes[i - 1] + numAttributes;\n        rewind(infile);\n        i = 0;\n        while (fgets(line, 1024, infile) != NULL) {\n            if (strtok(line, \" \\t\\n\") == NULL)\n                continue;\n            for (j = 0; j < numAttributes; j++) {\n                buf[i] = atof(strtok(NULL, \" ,\\t\\n\"));\n                i++;\n            }\n        }\n        fclose(infile);\n    }\n    printf(\"I/O completed\\n\");\n\n    memcpy(attributes[0], buf, numObjects * numAttributes * sizeof(float));\n\n    //timing = omp_get_wtime();\n    for (i = 0; i < nloops; i++) {\n\n        cluster_centres = NULL;\n        cluster(numObjects, numAttributes,\n                attributes, /* [numObjects][numAttributes] */\n                nclusters, threshold, &cluster_centres);\n    }\n    //timing = omp_get_wtime() - timing;\n\n\n    printf(\"number of Clusters %d\\n\", nclusters);\n    printf(\"number of Attributes %d\\n\\n\", numAttributes);\n\n    //if (getenv(\"OUTPUT\")) {\n    if(1){\n        //FILE *file = fopen(\"output.txt\", \"w+\");\n        // Cluster Centers Output\n        // The first number is cluster number and the following data is arribute\n        // value\n        for (i = 0; i < nclusters; i++) {\n            printf( \"%d: \", i);\n            for (j = 0; j < numAttributes; j++)\n                printf( \"%.2f \", cluster_centres[i][j]);\n            printf( \"\\n\\n\");\n        }\n\n        //fclose(file);\n    }\n    printf(\"Time for process: %f\\n\", timing);\n\n    free(attributes);\n    free(cluster_centres[0]);\n    free(cluster_centres);\n    free(buf);\n    return (0);\n}\n", "label": 2}
{"code": "/*\n * Copyright (C) 2008 Princeton University\n * All rights reserved.\n * Authors: Jia Deng, Gilberto Contreras\n *\n * streamcluster - Online clustering algorithm\n *\n */\n#include <stdio.h>\n#include <iostream>\n#include <fstream>\n#include <stdlib.h>\n#include <string.h>\n#include <assert.h>\n#include <math.h>\n#include <sys/resource.h>\n#include <limits.h>\n\n#ifdef ENABLE_THREADS\n#include <pthread.h>\n#include \"parsec_barrier.hpp\"\n#endif\n\n#ifdef TBB_VERSION\n#define TBB_STEALER (tbb::task_scheduler_init::occ_stealer)\n#define NUM_DIVISIONS (nproc)\n#include \"tbb/task_scheduler_init.h\"\n#include \"tbb/blocked_range.h\"\n#include \"tbb/parallel_for.h\"\n#include \"tbb/parallel_reduce.h\"\n#include \"tbb/cache_aligned_allocator.h\"\nusing namespace tbb;\n#endif\n\n#ifdef ENABLE_PARSEC_HOOKS\n#include <hooks.h>\n#endif\n\nusing namespace std;\n\n#define MAXNAMESIZE 1024 // max filename length\n#define SEED 1\n/* increase this to reduce probability of random error */\n/* increasing it also ups running time of \"speedy\" part of the code */\n/* SP = 1 seems to be fine */\n#define SP 1 // number of repetitions of speedy must be >=1\n\n/* higher ITER --> more likely to get correct # of centers */\n/* higher ITER also scales the running time almost linearly */\n#define ITER 3 // iterate ITER* k log k times; ITER >= 1\n\n#define CACHE_LINE 32 // cache line in byte\n\n/* this structure represents a point */\n/* these will be passed around to avoid copying coordinates */\ntypedef struct {\n  float weight;\n  float *coord;\n  long assign;  /* number of point where this one is assigned */\n  float cost;  /* cost of that assignment, weight*distance */\n} Point;\n\n/* this is the array of points */\ntypedef struct {\n  long num; /* number of points; may not be N if this is a sample */\n  int dim;  /* dimensionality */\n  Point *p; /* the array itself */\n} Points;\n\nstatic bool *switch_membership; //whether to switch membership in pgain\nstatic bool* is_center; //whether a point is a center\nstatic int* center_table; //index table of centers\n\nstatic int nproc; //# of threads\n\n\n#ifdef TBB_VERSION\ntbb::cache_aligned_allocator<float> memoryFloat;\ntbb::cache_aligned_allocator<Point> memoryPoint;\ntbb::cache_aligned_allocator<long> memoryLong;\ntbb::cache_aligned_allocator<int> memoryInt;\ntbb::cache_aligned_allocator<bool> memoryBool;\n#endif\n\n\nfloat dist(Point p1, Point p2, int dim);\n\n\n#ifdef TBB_VERSION\nstruct HizReduction {\nprivate:\n  double hiz;\npublic:\n  Points *points;\n  HizReduction(Points *points_): hiz(0),points(points_){}\n  HizReduction( HizReduction &d, tbb::split){hiz=0; points = d.points;}\n\n  void operator()(const tbb::blocked_range<int>& range) {\n    double myhiz = 0;\n    long ptDimension = points->dim;\n    int begin = range.begin();\n    int end = range.end();\n    \n    for(int kk=begin; kk!=end; kk++) {\n      myhiz += dist(points->p[kk], points->p[0],\n\t\t\t ptDimension)*points->p[kk].weight;\n    }\n    hiz += myhiz;\n  }\n\n  void join(HizReduction &d){hiz += d.getHiz(); /*fprintf(stderr,\"reducing: %lf\\n\",hiz);*/}\n  double getHiz(){return hiz;}\n\n};\n\n\nstruct CenterCreate {\n  Points *points;\n  CenterCreate(Points *p): points(p){}\n  void operator()(const tbb::blocked_range<int>&range) const {\n    int begin = range.begin();\n    int end = range.end();\n    \n     for( int k = begin; k!=end; k++ )    {\n       float distance = dist(points->p[k],points->p[0],points->dim);\n       points->p[k].cost = distance * points->p[k].weight;\n       points->p[k].assign=0;\n     } \n  }\n\n};\n\n\n\nstruct CenterOpen {\nprivate:\n  double total_cost;\npublic:\n  Points *points;\n  int i;\n  int type; /*type=0: compute. type=1: reduction */\n  CenterOpen(Points *p):points(p),total_cost(0),type(0){}\n  CenterOpen(CenterOpen &rhs, tbb::split) \n  {\n    total_cost = 0; \n    points = rhs.points;\n    i = rhs.i;\n    type = rhs.type;\n  }\n\n  void operator()(const tbb::blocked_range<int> &range) {\n    int begin = range.begin();\n    int end = range.end();\n\n    if(type) {\n      double local_total = 0.0;\n      for(int k = begin; k!=end; k++ )  \n\tlocal_total+=points->p[k].cost;\n      total_cost += local_total;\n    }\n    else {\n      for(int k = begin; k!=end; k++ )  {\n\tfloat distance = dist(points->p[i],points->p[k],points->dim);\n\tif( i && distance*points->p[k].weight < points->p[k].cost )  {\n\t  points->p[k].cost = distance * points->p[k].weight;\n\t  points->p[k].assign=i;\n\t}\n      }\n    }\n    \n  }\n\n\n  void join(CenterOpen &lhs){total_cost+=lhs.getTotalCost();}\n  double getTotalCost(){return total_cost;}\n\n};\n\n\n\nclass CenterTableCount: public tbb::task{\nprivate:\n  Points *points;\n  double *work_mem;\n  int stride;\n  int pid;\npublic:\n  CenterTableCount(int id, int s, Points *p, double *mem):\n    pid(id), stride(s), points(p),work_mem(mem){}\n\n  task *execute() {\n    int count = 0;\n    long bsize = points->num/((NUM_DIVISIONS));\n    long k1 = bsize * pid;\n    long k2 = k1 + bsize;\n\n    if( pid == (NUM_DIVISIONS)-1 ) \n      k2 = points->num;\n\n    /* fprintf(stderr,\"\\t[CenterTableCount]: pid=%d stride=%d from %d to %d\\n\",\n       pid, stride, k1, k2); */\n\n    for( int i = k1; i < k2; i++ ) {\n      if( is_center[i] ) {\n\tcenter_table[i] = count++;\n      }\n    }\n\n    work_mem[pid*stride] = count;\n    //fprintf(stderr,\"PID %d done!\\n\",pid);\n    return NULL;\n  }\n\n};\n\n\nclass CenterTableCountTask: public tbb::task {\n  int is_continuation;\n  Points *points;\n  double *work_mem;\n  int stride;\npublic:\n  CenterTableCountTask(int s, Points *p, double *mem):\n    stride(s), points(p), work_mem(mem), is_continuation(0){} \n\n  task *execute() {\n    tbb::task_list list;\n    int p;\n    \n    if(!is_continuation) {\n      recycle_as_continuation();\n      set_ref_count(NUM_DIVISIONS);\n\n      for(p = 1; p < (NUM_DIVISIONS); p++ ) \n\t  list.push_back( *new( allocate_child() ) CenterTableCount(p, stride, points, work_mem));\n      CenterTableCount &me = *new( allocate_child() ) CenterTableCount(0, stride, points, work_mem);\n      spawn(list);\n      is_continuation = 1;\n      \n      return &me;\n\n    }else {\n      /* continuation part */\n      int accum = 0;\n      for( int p = 0; p < (NUM_DIVISIONS); p++ ) {\n\tint tmp = (int)work_mem[p*stride];\n\twork_mem[p*stride] = accum;\n\taccum += tmp;\n      }\n      //fprintf(stderr,\"Accum = %d\\n\",accum);\n      return NULL;\n    }\n  }\n};\n\n\nclass FixCenter: public tbb::task {\n  Points *points;\n  double *work_mem;\n  int pid;\n  int stride;\npublic:\n  FixCenter(int id, int s, Points *p, double *mem):\n    pid(id),stride(s),points(p),work_mem(mem){}\n  task *execute(){\n#ifdef SERIAL_FIXCENTER\n    long k1 = 0;\n    long k2 = points->num;\n#else    \n    long bsize = points->num/((NUM_DIVISIONS));\n    long k1 = bsize * pid;\n    long k2 = k1 + bsize;\n    if( pid == (NUM_DIVISIONS)-1 ) k2 = points->num;\n#endif\n    /*fprintf(stderr,\"\\t[FixCenter]: pid=%d stride=%d from %d to %d is_center=0x%08x\\n\",\n      pid, stride, k1, k2,(int)is_center);  */\n    \n    for( int i = k1; i < k2; i++ ) {\n      if( is_center[i] ) {\n\tcenter_table[i] += (int)work_mem[pid*stride];\n\t//fprintf(stderr,\"\\tcenter_table[%d] = %d\\n\",i,center_table[i]);\n      }\n\n    }\n      //fprintf(stderr,\"PID %d done!\\n\",pid);\n    return NULL;\n\n  }\n};\n\nclass FixCenterTask: public tbb::task {\n  bool is_continuation;\n  Points *points;\n  double *work_mem;\n  int stride;\npublic:\n  FixCenterTask(int s, Points *p, double *mem):\n    stride(s), points(p), work_mem(mem), is_continuation(false){} \n\n  task *execute() {\n    tbb::task_list list;\n    int p;\n    if(!is_continuation) {\n      recycle_as_continuation();\n      set_ref_count(NUM_DIVISIONS);\n      for(p = 1; p < (NUM_DIVISIONS); p++ ) \n\t  list.push_back( *new( allocate_child() ) FixCenter(p, stride, points, work_mem));\n      spawn(list);\n      FixCenter &me = *new (allocate_child()) FixCenter(0, stride, points, work_mem);\n      is_continuation = true;\n      return &me;\n    }else {\n      /* coninuation */\n      return NULL;\n    }\n  }\n};\n\n\nclass LowerCost: public tbb::task {\n  Points *points;\n  double *work_mem;\n  long x;\n  int K;\n  int pid;\n  int stride;\npublic:\n  LowerCost(int id, int s, Points *p, long x_, double *mem, int k): \n    pid(id), stride(s), points(p), work_mem(mem), K(k), x(x_){}\n  task *execute() {\n\n    //my *lower* fields\n    double* lower = &work_mem[pid*stride];\n    double local_cost_of_opening_x = 0;\n    long bsize = points->num/((NUM_DIVISIONS)); //points->num/1;//((NUM_DIVISIONS));\n    long k1 = bsize * pid;\n    long k2 = k1 + bsize;\n    int i;\n\n    if( pid == (NUM_DIVISIONS)-1 ) \n      k2 = points->num;\n\n\n    /*fprintf(stderr,\"\\t[LowerCost]: pid=%d stride=%d from %d to %d\\n\",\n      pid, stride, k1, k2);  */\n    \n    double *cost_of_opening_x = &work_mem[pid*stride + K+1];\n\n    for ( i = k1; i < k2; i++ ) {\n      float x_cost = dist(points->p[i], points->p[x], points->dim) \n\t* points->p[i].weight;\n      float current_cost = points->p[i].cost;\n\n      //fprintf(stderr,\"\\t (x_cost=%lf < current_cost=%lf)\\n\",x_cost, current_cost);\n      if ( x_cost < current_cost ) {\n\n\t// point i would save cost just by switching to x\n\t// (note that i cannot be a median, \n\t// or else dist(p[i], p[x]) would be 0)\n\t\n\tswitch_membership[i] = 1;\n\tlocal_cost_of_opening_x += x_cost - current_cost;\n\t\n      } else {\n\t\n\t// cost of assigning i to x is at least current assignment cost of i\n\t\n\t// consider the savings that i's **current** median would realize\n\t// if we reassigned that median and all its members to x;\n\t// note we've already accounted for the fact that the median\n\t// would save z by closing; now we have to subtract from the savings\n\t// the extra cost of reassigning that median and its members \n\tint assign = points->p[i].assign;\n\tlower[center_table[assign]] += current_cost - x_cost;\n\t//fprintf(stderr,\"Lower[%d]=%lf\\n\",center_table[assign], lower[center_table[assign]]);\n      }\n    }\n    \n    *cost_of_opening_x = local_cost_of_opening_x;\n    return NULL;\n  }\n  \n  \n};\n  \nclass LowerCostTask: public tbb::task {\n  bool is_continuation;\n  Points *points;\n  double *work_mem;\n  int K;\n  long x;\n  int stride;\npublic:\n  LowerCostTask(int s, Points *p, long x_, double *mem, int k): \n    stride(s), points(p), work_mem(mem), K(k), x(x_), is_continuation(false){}\n\n  task *execute() {\n    tbb::task_list list;\n    int p;\n    if(!is_continuation) {\n      recycle_as_continuation();\n      set_ref_count(NUM_DIVISIONS);\n      for(p = 1; p < (NUM_DIVISIONS); p++ ) \n\t  list.push_back( *new( allocate_child() )  LowerCost(p, stride, points, x, work_mem, K));\n      spawn(list);\n      LowerCost &me = *new (allocate_child())  LowerCost(0, stride, points, x, work_mem, K);\n      is_continuation = true;\n      return &me;\n    }else {\n      /* continuation */\n      return NULL;\n    }\n  }\n};\n\n\n\n\nclass CenterClose: public tbb::task {\n  Points *points;\n  double *work_mem;\n  double *number_of_centers_to_close;\n  double z;\n  int pid, stride;\n  int K;\n\npublic:\n  CenterClose(int id, int s, Points *p, double *mem, int k, double z_): \n    pid(id),stride(s),points(p),work_mem(mem),K(k), z(z_){}\n\n  task *execute() {\n    double* gl_lower = &work_mem[(NUM_DIVISIONS)*stride];\n    double *cost_of_opening_x;\n    int local_number_of_centers_to_close = 0;\n    long bsize = points->num/((NUM_DIVISIONS)); //\n    long k1 = bsize * pid;\n    long k2 = k1 + bsize;\n\n    if( pid == (NUM_DIVISIONS)-1 ) \n      k2 = points->num;\n\n    /*fprintf(stderr,\"\\t[CenterClose]: pid=%d stride=%d from %d to %d\\n\",\n      pid, stride, k1, k2); */\n\n    number_of_centers_to_close = &work_mem[pid*stride + K];\n    cost_of_opening_x = &work_mem[pid*stride + K+1];\n    \n      for ( int i = k1; i < k2; i++ ) {\n\tif( is_center[i] ) {\n\t  double low = z;\n\t  //aggregate from all threads\n\t  for( int p = 0; p < (NUM_DIVISIONS); p++ ) {\n\t    low += work_mem[center_table[i]+p*stride];\n\t  }\n\t  gl_lower[center_table[i]] = low;\n\t  if ( low > 0 ) {\n\t    // i is a median, and\n\t    // if we were to open x (which we still may not) we'd close i\n\t    \n\t    // note, we'll ignore the following quantity unless we do open x\n\t    ++local_number_of_centers_to_close;  \n\t    *cost_of_opening_x -= low;\n\t  }\n\t}\n      }\n      *number_of_centers_to_close = (double)local_number_of_centers_to_close;\n      return NULL;\n  }\n\n};\n\n\nclass CenterCloseTask: public tbb::task {\n  bool is_continuation;\n  Points *points;\n  double *work_mem;\n  int stride;\n  double z;\n  int K;\npublic:\n  CenterCloseTask(int s, Points *p, double *mem, int k, double z_): \n    stride(s),points(p),work_mem(mem),K(k), z(z_), is_continuation(false){}\n\n  task *execute() {\n    tbb::task_list list;\n    int p;\n    if(!is_continuation) {\n      recycle_as_continuation();\n      set_ref_count(NUM_DIVISIONS);\n      for(p = 1; p < (NUM_DIVISIONS); p++ ) \n\tlist.push_back( *new( allocate_child() )  CenterClose(p, stride, points, work_mem, K, z));\n      spawn(list);\n      CenterClose &me = *new (allocate_child())  CenterClose(0, stride, points, work_mem, K, z);\n      is_continuation = true;\n      return &me;\n    }else {\n      /* coninuation */\n\n\n      return NULL;\n    }\n  }\n};\n\n\n\nclass SaveMoney: public tbb::task{\n  Points *points;\n  double *work_mem;\n  long x;\n  int pid, stride;\npublic:\n  SaveMoney(int id, int s, Points *p, long x_, double *mem): \n    pid(id), stride(s), points(p), x(x_), work_mem(mem){}\n  task *execute() {\n    double* gl_lower = &work_mem[(NUM_DIVISIONS)*stride];\n    long bsize = points->num/((NUM_DIVISIONS));//points->num/1;//((NUM_DIVISIONS));\n    long k1 = bsize * pid;\n    long k2 = k1 + bsize;\n    int i;\n    \n    if( pid == (NUM_DIVISIONS)-1 ) \n      k2 = points->num;\n\n    /*fprintf(stderr,\"\\t[SaveMoney]: pid=%d stride=%d from %d to %d\\n\",\n      pid, stride, k1, k2);   */\n    \n\n    //  we'd save money by opening x; we'll do it\n    for ( int i = k1; i < k2; i++ ) {\n      bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;\n      if ( switch_membership[i] || close_center ) {\n\t// Either i's median (which may be i itself) is closing,\n\t// or i is closer to x than to its current median\n\tpoints->p[i].cost = points->p[i].weight *\n\t  dist(points->p[i], points->p[x], points->dim);\n\tpoints->p[i].assign = x;\n\t//fprintf(stderr,\"\\t[SaveMoney] %d: cost %lf, x=%d\\n\",i,points->p[i].cost, x);\n      }\n    }\n    for( int i = k1; i < k2; i++ ) {\n      if( is_center[i] && gl_lower[center_table[i]] > 0 ) {\n\tis_center[i] = false;\n      }\n    }\n    if( x >= k1 && x < k2 ) {\n      //fprintf(stderr,\"\\t-->is_center[%d]=true!\\n\",x);\n      is_center[x] = true;\n    }\n\n\n    return NULL;\n  }\n};\n\n\nclass SaveMoneyTask: public tbb::task {\n  bool is_continuation;\n  Points *points;\n  long x;\n  double* work_mem;\n  int stride;\n\npublic:\n  SaveMoneyTask(int s, Points *p, long x_, double *mem): \n    stride(s), points(p), x(x_), work_mem(mem) ,is_continuation(false){}\n\n\n  task *execute() {\n    tbb::task_list list;\n    int p;\n    if(!is_continuation) {\n      recycle_as_continuation();\n      set_ref_count(NUM_DIVISIONS);\n      for(p = 1; p < (NUM_DIVISIONS); p++ ) \n\tlist.push_back( *new( allocate_child() )  SaveMoney(p, stride, points, x, work_mem));\n      spawn(list);\n      SaveMoney &me = *new (allocate_child())  SaveMoney(0, stride, points, x, work_mem);\n      is_continuation = true;\n      return &me;\n    }else {\n      /* coninuation */\n\n\n      return NULL;\n    }\n  }\n};\n\n#endif //TBB_VERSION\n/********************************************/\n\n\n\nint isIdentical(float *i, float *j, int D)\n// tells whether two points of D dimensions are identical\n{\n  int a = 0;\n  int equal = 1;\n\n  while (equal && a < D) {\n    if (i[a] != j[a]) equal = 0;\n    else a++;\n  }\n  if (equal) return 1;\n  else return 0;\n\n}\n\n/* comparator for floating point numbers */\nstatic int floatcomp(const void *i, const void *j)\n{\n  float a, b;\n  a = *(float *)(i);\n  b = *(float *)(j);\n  if (a > b) return (1);\n  if (a < b) return (-1);\n  return(0);\n}\n\n/* shuffle points into random order */\nvoid shuffle(Points *points)\n{\n  long i, j;\n  Point temp;\n  for (i=0;i<points->num-1;i++) {\n    j=(lrand48()%(points->num - i)) + i;\n    temp = points->p[i];\n    points->p[i] = points->p[j];\n    points->p[j] = temp;\n  }\n}\n\n/* shuffle an array of integers */\nvoid intshuffle(int *intarray, int length)\n{\n  long i, j;\n  int temp;\n  for (i=0;i<length;i++) {\n    j=(lrand48()%(length - i))+i;\n    temp = intarray[i];\n    intarray[i]=intarray[j];\n    intarray[j]=temp;\n  }\n}\n\n/* compute Euclidean distance squared between two points */\nfloat dist(Point p1, Point p2, int dim)\n{\n  int i;\n  float result=0.0;\n  for (i=0;i<dim;i++)\n    result += (p1.coord[i] - p2.coord[i])*(p1.coord[i] - p2.coord[i]);\n  return(result);\n}\n\n#ifdef TBB_VERSION\n/* run speedy on the points, return total cost of solution */\nfloat pspeedy(Points *points, float z, long *kcenter)\n{\n  static double totalcost;\n  static bool open = false;\n  static double* costs; //cost for each thread. \n  static int i;\n\n\n  /* create center at first point, send it to itself */\n  {\n    int grain_size = points->num / ((NUM_DIVISIONS));\n    CenterCreate c(points);\n    tbb::parallel_for(tbb::blocked_range<int>(0,points->num, grain_size),c);\n  }\n    \n  *kcenter = 1;\n\n\n  {\n    int grain_size = points->num / ((NUM_DIVISIONS));\n    double acc_cost = 0.0;\n    CenterOpen c(points);\n    for(i = 1; i < points->num; i++ )  {\n      bool to_open = ((float)lrand48()/(float)INT_MAX)<(points->p[i].cost/z);\n      if( to_open )  {\n\t(*kcenter)++;\n\tc.i = i;\n\t//fprintf(stderr,\"** New center for i=%d\\n\",i);\n\ttbb::parallel_reduce(tbb::blocked_range<int>(0,points->num,grain_size),c);\n      }\n    }\n\n    c.type = 1; /* Once last time for actual reduction */\n    tbb::parallel_reduce(tbb::blocked_range<int>(0,points->num,grain_size),c);\n\n\n    totalcost =z*(*kcenter);\n    totalcost += c.getTotalCost();\n  }\n  return(totalcost);\n}\n\n#else //!TBB_VERSION\n\nfloat pspeedy(Points *points, float z, long *kcenter, int pid, pthread_barrier_t* barrier)\n{\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  //my block\n  long bsize = points->num/nproc;\n  long k1 = bsize * pid;\n  long k2 = k1 + bsize;\n  if( pid == nproc-1 ) k2 = points->num;\n\n  static double totalcost;\n\n  static bool open = false;\n  static double* costs; //cost for each thread. \n  static int i;\n\n#ifdef ENABLE_THREADS\n  static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;\n  static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;\n#endif\n\n  /* create center at first point, send it to itself */\n  for( int k = k1; k < k2; k++ )    {\n    float distance = dist(points->p[k],points->p[0],points->dim);\n    points->p[k].cost = distance * points->p[k].weight;\n    points->p[k].assign=0;\n  }\n\n  if( pid==0 )   {\n    *kcenter = 1;\n    costs = (double*)malloc(sizeof(double)*nproc);\n  }\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n    \n  if( pid != 0 ) { // we are not the master threads. we wait until a center is opened.\n    while(1) {\n#ifdef ENABLE_THREADS\n      pthread_mutex_lock(&mutex);\n      while(!open) pthread_cond_wait(&cond,&mutex);\n      pthread_mutex_unlock(&mutex);\n#endif\n      if( i >= points->num ) break;\n      for( int k = k1; k < k2; k++ )\n\t{\n\t  float distance = dist(points->p[i],points->p[k],points->dim);\n\t  if( distance*points->p[k].weight < points->p[k].cost )\n\t    {\n\t      points->p[k].cost = distance * points->p[k].weight;\n\t      points->p[k].assign=i;\n\t    }\n\t}\n#ifdef ENABLE_THREADS\n      pthread_barrier_wait(barrier);\n      pthread_barrier_wait(barrier);\n#endif\n    } \n  }\n  else  { // I am the master thread. I decide whether to open a center and notify others if so. \n    for(i = 1; i < points->num; i++ )  {\n      bool to_open = ((float)lrand48()/(float)INT_MAX)<(points->p[i].cost/z);\n      if( to_open )  {\n\t(*kcenter)++;\n#ifdef ENABLE_THREADS\n\tpthread_mutex_lock(&mutex);\n#endif\n\topen = true;\n#ifdef ENABLE_THREADS\n\tpthread_mutex_unlock(&mutex);\n\tpthread_cond_broadcast(&cond);\n#endif\n\tfor( int k = k1; k < k2; k++ )  {\n\t  float distance = dist(points->p[i],points->p[k],points->dim);\n\t  if( distance*points->p[k].weight < points->p[k].cost )  {\n\t    points->p[k].cost = distance * points->p[k].weight;\n\t    points->p[k].assign=i;\n\t  }\n\t}\n#ifdef ENABLE_THREADS\n\tpthread_barrier_wait(barrier);\n#endif\n\topen = false;\n#ifdef ENABLE_THREADS\n\tpthread_barrier_wait(barrier);\n#endif\n      }\n    }\n#ifdef ENABLE_THREADS\n    pthread_mutex_lock(&mutex);\n#endif\n    open = true;\n#ifdef ENABLE_THREADS\n    pthread_mutex_unlock(&mutex);\n    pthread_cond_broadcast(&cond);\n#endif\n  }\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  open = false;\n  double mytotal = 0;\n  for( int k = k1; k < k2; k++ )  {\n    mytotal += points->p[k].cost;\n  }\n  costs[pid] = mytotal;\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  // aggregate costs from each thread\n  if( pid == 0 )\n    {\n      totalcost=z*(*kcenter);\n      for( int i = 0; i < nproc; i++ )\n\t{\n\t  totalcost += costs[i];\n\t} \n      free(costs);\n    }\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n\n  return(totalcost);\n}\n\n#endif // TBB_VERSION\n\n\n/* For a given point x, find the cost of the following operation:\n * -- open a facility at x if there isn't already one there,\n * -- for points y such that the assignment distance of y exceeds dist(y, x),\n *    make y a member of x,\n * -- for facilities y such that reassigning y and all its members to x \n *    would save cost, realize this closing and reassignment.\n * \n * If the cost of this operation is negative (i.e., if this entire operation\n * saves cost), perform this operation and return the amount of cost saved;\n * otherwise, do nothing.\n */\n\n/* numcenters will be updated to reflect the new number of centers */\n/* z is the facility cost, x is the number of this point in the array \n   points */\n\n\n#ifdef TBB_VERSION\ndouble pgain(long x, Points *points, double z, long int *numcenters)\n{\n  int i;\n  int number_of_centers_to_close = 0;\n\n  static double *work_mem;\n  static double gl_cost_of_opening_x;\n  static int gl_number_of_centers_to_close;\n\n  //each thread takes a block of working_mem.\n  int stride = *numcenters+2;\n\n  //make stride a multiple of CACHE_LINE\n  int cl = CACHE_LINE/sizeof(double);\n  if( stride % cl != 0 ) { \n    stride = cl * ( stride / cl + 1);\n  }\n  int K = stride -2 ; // K==*numcenters\n  \n  //my own cost of opening x\n  double cost_of_opening_x = 0;\n\n  work_mem = (double*) calloc(stride*((NUM_DIVISIONS)+1),sizeof(double));\n  \n  gl_cost_of_opening_x = 0;\n  gl_number_of_centers_to_close = 0;\n\n\n  /*For each center, we have a *lower* field that indicates \n    how much we will save by closing the center. \n    Each thread has its own copy of the *lower* fields as an array.\n    We first build a table to index the positions of the *lower* fields. \n  */\n\n  /*****  loopA() *****/\n  {\n    CenterTableCountTask &t = *new ( tbb::task::allocate_root() ) CenterTableCountTask(stride, points, work_mem);\n    tbb::task::spawn_root_and_wait(t);\n  }\n\n  \n  {\n    FixCenterTask &t = *new ( tbb::task::allocate_root() ) FixCenterTask(stride, points, work_mem);\n    tbb::task::spawn_root_and_wait(t);\n  }    \n\n  /***************/\n\n  //now we finish building the table. clear the working memory.\n  memset(switch_membership, 0, points->num*sizeof(bool));\n  memset(work_mem, 0, (NUM_DIVISIONS+1)*stride*sizeof(double));\n\n  /* loopB */\n  {\n    LowerCostTask &t = *new ( tbb::task::allocate_root() )  LowerCostTask(stride, points, x, work_mem, K);\n    tbb::task::spawn_root_and_wait(t);\n  }    \n\n  /* LoopC */\n  {\n    CenterCloseTask &t = *new ( tbb::task::allocate_root() )  CenterCloseTask(stride, points, work_mem, K, z);\n    tbb::task::spawn_root_and_wait(t);\n  }    \n\n\n  gl_cost_of_opening_x = z;\n  //aggregate\n  for( int p = 0; p < (NUM_DIVISIONS); p++ ) {\n    gl_number_of_centers_to_close += (int)work_mem[p*stride + K];\n    gl_cost_of_opening_x += work_mem[p*stride+K+1];\n  }\n\n  /*fprintf(stderr,\"\\tgl_number_of_centers_to_close = %d\\n\",gl_number_of_centers_to_close);\n    fprintf(stderr,\"\\tgl_cost_of_opening_x = %lf\\n\",gl_cost_of_opening_x); */\n\n\n  // Now, check whether opening x would save cost; if so, do it, and\n  // otherwise do nothing\n\n  if ( gl_cost_of_opening_x < 0 ) {\n\n    /* loopD */\n    SaveMoneyTask &t = *new ( tbb::task::allocate_root() )  SaveMoneyTask(stride, points, x, work_mem);\n    tbb::task::spawn_root_and_wait(t);\n\n\n    *numcenters = *numcenters + 1 - gl_number_of_centers_to_close;    \n  }\n  else {\n    gl_cost_of_opening_x = 0;  // the value we'll return\n  }\n\n  free(work_mem);\n\n  return -gl_cost_of_opening_x;\n}\n\n#else //!TBB_VERSION\n\n\ndouble pgain(long x, Points *points, double z, long int *numcenters, int pid, pthread_barrier_t* barrier)\n{\n  //  printf(\"pgain pthread %d begin\\n\",pid);\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n\n  //my block\n  long bsize = points->num/nproc;\n  long k1 = bsize * pid;\n  long k2 = k1 + bsize;\n  if( pid == nproc-1 ) k2 = points->num;\n\n  int i;\n  int number_of_centers_to_close = 0;\n\n  static double *work_mem;\n  static double gl_cost_of_opening_x;\n  static int gl_number_of_centers_to_close;\n\n  //each thread takes a block of working_mem.\n  int stride = *numcenters+2;\n  //make stride a multiple of CACHE_LINE\n  int cl = CACHE_LINE/sizeof(double);\n  if( stride % cl != 0 ) { \n    stride = cl * ( stride / cl + 1);\n  }\n  int K = stride -2 ; // K==*numcenters\n  \n  //my own cost of opening x\n  double cost_of_opening_x = 0;\n\n  if( pid==0 )    { \n    work_mem = (double*) malloc(stride*(nproc+1)*sizeof(double));\n    gl_cost_of_opening_x = 0;\n    gl_number_of_centers_to_close = 0;\n  }\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  /*For each center, we have a *lower* field that indicates \n    how much we will save by closing the center. \n    Each thread has its own copy of the *lower* fields as an array.\n    We first build a table to index the positions of the *lower* fields. \n  */\n\n  int count = 0;\n  for( int i = k1; i < k2; i++ ) {\n    if( is_center[i] ) {\n      center_table[i] = count++;\n    }\n  }\n  work_mem[pid*stride] = count;\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n\n  if( pid == 0 ) {\n    int accum = 0;\n    for( int p = 0; p < nproc; p++ ) {\n      int tmp = (int)work_mem[p*stride];\n      work_mem[p*stride] = accum;\n      accum += tmp;\n    }\n  }\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n\n  for( int i = k1; i < k2; i++ ) {\n    if( is_center[i] ) {\n      center_table[i] += (int)work_mem[pid*stride];\n    }\n  }\n\n  //now we finish building the table. clear the working memory.\n  memset(switch_membership + k1, 0, (k2-k1)*sizeof(bool));\n  memset(work_mem+pid*stride, 0, stride*sizeof(double));\n  if( pid== 0 ) memset(work_mem+nproc*stride,0,stride*sizeof(double));\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  \n  //my *lower* fields\n  double* lower = &work_mem[pid*stride];\n  //global *lower* fields\n  double* gl_lower = &work_mem[nproc*stride];\n\n  for ( i = k1; i < k2; i++ ) {\n    float x_cost = dist(points->p[i], points->p[x], points->dim) \n      * points->p[i].weight;\n    float current_cost = points->p[i].cost;\n\n    if ( x_cost < current_cost ) {\n\n      // point i would save cost just by switching to x\n      // (note that i cannot be a median, \n      // or else dist(p[i], p[x]) would be 0)\n      \n      switch_membership[i] = 1;\n      cost_of_opening_x += x_cost - current_cost;\n\n    } else {\n\n      // cost of assigning i to x is at least current assignment cost of i\n\n      // consider the savings that i's **current** median would realize\n      // if we reassigned that median and all its members to x;\n      // note we've already accounted for the fact that the median\n      // would save z by closing; now we have to subtract from the savings\n      // the extra cost of reassigning that median and its members \n      int assign = points->p[i].assign;\n      lower[center_table[assign]] += current_cost - x_cost;\n    }\n  }\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n\n  // at this time, we can calculate the cost of opening a center\n  // at x; if it is negative, we'll go through with opening it\n\n  for ( int i = k1; i < k2; i++ ) {\n    if( is_center[i] ) {\n      double low = z;\n      //aggregate from all threads\n      for( int p = 0; p < nproc; p++ ) {\n\tlow += work_mem[center_table[i]+p*stride];\n      }\n      gl_lower[center_table[i]] = low;\n      if ( low > 0 ) {\n\t// i is a median, and\n\t// if we were to open x (which we still may not) we'd close i\n\n\t// note, we'll ignore the following quantity unless we do open x\n\t++number_of_centers_to_close;  \n\tcost_of_opening_x -= low;\n      }\n    }\n  }\n  //use the rest of working memory to store the following\n  work_mem[pid*stride + K] = number_of_centers_to_close;\n  work_mem[pid*stride + K+1] = cost_of_opening_x;\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  //  printf(\"thread %d cost complete\\n\",pid); \n\n  if( pid==0 ) {\n    gl_cost_of_opening_x = z;\n    //aggregate\n    for( int p = 0; p < nproc; p++ ) {\n      gl_number_of_centers_to_close += (int)work_mem[p*stride + K];\n      gl_cost_of_opening_x += work_mem[p*stride+K+1];\n    }\n  }\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  // Now, check whether opening x would save cost; if so, do it, and\n  // otherwise do nothing\n\n  if ( gl_cost_of_opening_x < 0 ) {\n    //  we'd save money by opening x; we'll do it\n    for ( int i = k1; i < k2; i++ ) {\n      bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;\n      if ( switch_membership[i] || close_center ) {\n\t// Either i's median (which may be i itself) is closing,\n\t// or i is closer to x than to its current median\n\tpoints->p[i].cost = points->p[i].weight *\n\t  dist(points->p[i], points->p[x], points->dim);\n\tpoints->p[i].assign = x;\n      }\n    }\n    for( int i = k1; i < k2; i++ ) {\n      if( is_center[i] && gl_lower[center_table[i]] > 0 ) {\n\tis_center[i] = false;\n      }\n    }\n    if( x >= k1 && x < k2 ) {\n      is_center[x] = true;\n    }\n\n    if( pid==0 ) {\n      *numcenters = *numcenters + 1 - gl_number_of_centers_to_close;\n    }\n  }\n  else {\n    if( pid==0 )\n      gl_cost_of_opening_x = 0;  // the value we'll return\n  }\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  if( pid == 0 ) {\n    free(work_mem);\n    //    free(is_center);\n    //    free(switch_membership);\n    //    free(proc_cost_of_opening_x);\n    //    free(proc_number_of_centers_to_close);\n  }\n\n  return -gl_cost_of_opening_x;\n}\n\n#endif // TBB_VERSION\n\n\n\n/* facility location on the points using local search */\n/* z is the facility cost, returns the total cost and # of centers */\n/* assumes we are seeded with a reasonable solution */\n/* cost should represent this solution's cost */\n/* halt if there is < e improvement after iter calls to gain */\n/* feasible is an array of numfeasible points which may be centers */\n\n#ifdef TBB_VERSION\nfloat pFL(Points *points, int *feasible, int numfeasible,\n\t  double z, long *k, double cost, long iter, double e)\n{\n\n  long i;\n  long x;\n  double change;\n  long numberOfPoints;\n\n  change = cost;\n  /* continue until we run iter iterations without improvement */\n  /* stop instead if improvement is less than e */\n  while (change/cost > 1.0*e) {\n    change = 0.0;\n    numberOfPoints = points->num;\n    /* randomize order in which centers are considered */    \n    intshuffle(feasible, numfeasible);\n\n    for (i=0;i<iter;i++) {\n      x = i%numfeasible;\n      //fprintf(stderr,\"Iteration %d z=%lf, change=%lf\\n\",i,z,change);\n      change += pgain(feasible[x], points, z , k);\n      //fprintf(stderr,\"*** change: %lf, z=%lf\\n\",change,z);\n    }\n    cost -= change;\n  }\n\n  return(cost);\n}\n\n\n#else //!TBB_VERSION\n float pFL(Points *points, int *feasible, int numfeasible,\n\t  float z, long *k, double cost, long iter, float e, \n\t  int pid, pthread_barrier_t* barrier)\n{\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n  long i;\n  long x;\n  double change;\n  long numberOfPoints;\n\n  change = cost;\n  /* continue until we run iter iterations without improvement */\n  /* stop instead if improvement is less than e */\n  while (change/cost > 1.0*e) {\n    change = 0.0;\n    numberOfPoints = points->num;\n    /* randomize order in which centers are considered */\n\n    if( pid == 0 ) {\n      intshuffle(feasible, numfeasible);\n    }\n#ifdef ENABLE_THREADS\n    pthread_barrier_wait(barrier);\n#endif\n    for (i=0;i<iter;i++) {\n      x = i%numfeasible;\n      change += pgain(feasible[x], points, z, k, pid, barrier);\n    }\n    cost -= change;\n#ifdef ENABLE_THREADS\n    pthread_barrier_wait(barrier);\n#endif\n  }\n  return(cost);\n}\n\n#endif // TBB_VERSION\n\n\n\n#ifdef TBB_VERSION\nint selectfeasible_fast(Points *points, int **feasible, int kmin)\n#else\nint selectfeasible_fast(Points *points, int **feasible, int kmin, int pid, pthread_barrier_t* barrier)\n#endif\n{\n  int numfeasible = points->num;\n  if (numfeasible > (ITER*kmin*log((double)kmin)))\n    numfeasible = (int)(ITER*kmin*log((double)kmin));\n  *feasible = (int *)malloc(numfeasible*sizeof(int));\n  \n  float* accumweight;\n  float totalweight;\n\n  /* \n     Calcuate my block. \n     For now this routine does not seem to be the bottleneck, so it is not parallelized. \n     When necessary, this can be parallelized by setting k1 and k2 to \n     proper values and calling this routine from all threads ( it is called only\n     by thread 0 for now ). \n     Note that when parallelized, the randomization might not be the same and it might\n     not be difficult to measure the parallel speed-up for the whole program. \n   */\n  //  long bsize = numfeasible;\n  long k1 = 0;\n  long k2 = numfeasible;\n\n  float w;\n  int l,r,k;\n\n  /* not many points, all will be feasible */\n  if (numfeasible == points->num) {\n    for (int i=k1;i<k2;i++)\n      (*feasible)[i] = i;\n    return numfeasible;\n  }\n#ifdef TBB_VERSION\n  accumweight= (float*)memoryFloat.allocate(sizeof(float)*points->num);\n#else\n  accumweight= (float*)malloc(sizeof(float)*points->num);\n#endif\n\n  accumweight[0] = points->p[0].weight;\n  totalweight=0;\n  for( int i = 1; i < points->num; i++ ) {\n    accumweight[i] = accumweight[i-1] + points->p[i].weight;\n  }\n  totalweight=accumweight[points->num-1];\n\n  for(int i=k1; i<k2; i++ ) {\n    w = (lrand48()/(float)INT_MAX)*totalweight;\n    //binary search\n    l=0;\n    r=points->num-1;\n    if( accumweight[0] > w )  { \n      (*feasible)[i]=0; \n      continue;\n    }\n    while( l+1 < r ) {\n      k = (l+r)/2;\n      if( accumweight[k] > w ) {\n\tr = k;\n      } \n      else {\n\tl=k;\n      }\n    }\n    (*feasible)[i]=r;\n  }\n\n#ifdef TBB_VERSION\n  memoryFloat.deallocate(accumweight, sizeof(float));\n#else\n  free(accumweight); \n#endif\n\n  return numfeasible;\n}\n\n\n\n#ifdef TBB_VERSION\n/* compute approximate kmedian on the points */\nfloat pkmedian(Points *points, long kmin, long kmax, long* kfinal,\n\t       int pid, pthread_barrier_t* barrier )\n{\n  int i;\n  double cost;\n  double lastcost;\n  double hiz, loz, z;\n\n  static long k;\n  static int *feasible;\n  static int numfeasible;\n  static double* hizs;\n\n\n  //  hizs = (double*)calloc(nproc,sizeof(double));\n  hiz = loz = 0.0;\n  long numberOfPoints = points->num;\n  long ptDimension = points->dim;\n\n  //my block\n  long bsize = points->num/nproc;\n  long k1 = bsize * pid;\n  long k2 = k1 + bsize;\n  if( pid == nproc-1 ) k2 = points->num;\n\n  \n  //fprintf(stderr,\"Starting Kmedian procedure\\n\");\n  //fprintf(stderr,\"%i points in %i dimensions\\n\", numberOfPoints, ptDimension);\n\n  int grain_size = points->num / ((NUM_DIVISIONS));\n  if(grain_size==0)\n    {\n      \n      for (long kk=0;kk < points->num; kk++ ) \n\t{\n\t  hiz += dist(points->p[kk], points->p[0],\n\t\t      ptDimension)*points->p[kk].weight;\n\t}\n      \n    }\n  else {\n    HizReduction h(points);\n    tbb::parallel_reduce(tbb::blocked_range<int>(0,points->num, grain_size), h);\n    hiz = h.getHiz();\n  }\n\n  loz=0.0; z = (hiz+loz)/2.0;\n\n  /* NEW: Check whether more centers than points! */\n  if (points->num <= kmax) {\n    /* just return all points as facilities */\n      for (long kk=0;kk<points->num;kk++) \n\t{\n\t  points->p[kk].assign = kk;\n\t  points->p[kk].cost = 0;\n\t}\n    \n    cost = 0;\n    *kfinal = k;\n\n    return cost;\n  }\n\n    shuffle(points);\n    cost = pspeedy(points, z, &k);\n\n    i=0;\n\n  /* give speedy SP chances to get at least kmin/2 facilities */\n  while ((k < kmin)&&(i<SP)) {\n    cost = pspeedy(points, z, &k);\n    i++;\n  }\n\n  /* if still not enough facilities, assume z is too high */\n  while (k < kmin) {\n    if (i >= SP) \n      {hiz=z; z=(hiz+loz)/2.0; i=0;}\n    \n    shuffle(points);\n    cost =  pspeedy(points, z, &k);\n    i++;\n  }\n\n  /* now we begin the binary search for real */\n  /* must designate some points as feasible centers */\n  /* this creates more consistancy between FL runs */\n  /* helps to guarantee correct # of centers at the end */\n\n    numfeasible = selectfeasible_fast(points,&feasible,kmin);\n    for( int i = 0; i< points->num; i++ ) {\n      //fprintf(stderr,\"\\t-->is_center[%d]=true!\\n\",points->p[i].assign);\n      is_center[points->p[i].assign]= true;\n    }\n\n\n  while(1) {\n    /* first get a rough estimate on the FL solution */\n    lastcost = cost;\n    cost = pFL(points, feasible, numfeasible,\n\t       z, &k, cost, (long)(ITER*kmax*log((double)kmax)), 0.1);\n\n    /* if number of centers seems good, try a more accurate FL */\n    if (((k <= (1.1)*kmax)&&(k >= (0.9)*kmin))||\n\t((k <= kmax+2)&&(k >= kmin-2))) {\n      \n      /* may need to run a little longer here before halting without\n\t improvement */\n      cost = pFL(points, feasible, numfeasible,\n\t\t z, &k, cost, (long)(ITER*kmax*log((double)kmax)), 0.001);\n    }\n\n    if (k > kmax) {\n      /* facilities too cheap */\n      /* increase facility cost and up the cost accordingly */\n      loz = z; z = (hiz+loz)/2.0;\n      cost += (z-loz)*k;\n    }\n    if (k < kmin) {\n      /* facilities too expensive */\n      /* decrease facility cost and reduce the cost accordingly */\n      hiz = z; z = (hiz+loz)/2.0;\n      cost += (z-hiz)*k;\n    }\n\n    /* if k is good, return the result */\n    /* if we're stuck, just give up and return what we have */\n    if (((k <= kmax)&&(k >= kmin))||((loz >= (0.999)*hiz)) )\n      { \n\tbreak;\n      }\n\n  }\n\n  //  fprintf(stderr,\"Cleaning up...\\n\");\n  //clean up...\n  free(feasible); \n  *kfinal = k;\n\n  return cost;\n}\n\n\n#else //!TBB_VERSION\n\n/* compute approximate kmedian on the points */\nfloat pkmedian(Points *points, long kmin, long kmax, long* kfinal,\n\t       int pid, pthread_barrier_t* barrier )\n{\n  int i;\n  double cost;\n  double lastcost;\n  double hiz, loz, z;\n\n  static long k;\n  static int *feasible;\n  static int numfeasible;\n  static double* hizs;\n\n  if( pid==0 ) hizs = (double*)calloc(nproc,sizeof(double));\n  hiz = loz = 0.0;\n  long numberOfPoints = points->num;\n  long ptDimension = points->dim;\n\n  //my block\n  long bsize = points->num/nproc;\n  long k1 = bsize * pid;\n  long k2 = k1 + bsize;\n  if( pid == nproc-1 ) k2 = points->num;\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n\n  double myhiz = 0;\n  for (long kk=k1;kk < k2; kk++ ) {\n    myhiz += dist(points->p[kk], points->p[0],\n\t\t      ptDimension)*points->p[kk].weight;\n  }\n  hizs[pid] = myhiz;\n\n#ifdef ENABLE_THREADS  \n  pthread_barrier_wait(barrier);\n#endif\n\n  for( int i = 0; i < nproc; i++ )   {\n    hiz += hizs[i];\n  }\n\n  loz=0.0; z = (hiz+loz)/2.0;\n  /* NEW: Check whether more centers than points! */\n  if (points->num <= kmax) {\n    /* just return all points as facilities */\n    for (long kk=k1;kk<k2;kk++) {\n      points->p[kk].assign = kk;\n      points->p[kk].cost = 0;\n    }\n    cost = 0;\n    if( pid== 0 ) {\n      free(hizs); \n      *kfinal = k;\n    }\n    return cost;\n  }\n\n  if( pid == 0 ) shuffle(points);\n  cost = pspeedy(points, z, &k, pid, barrier);\n\n  i=0;\n  /* give speedy SP chances to get at least kmin/2 facilities */\n  while ((k < kmin)&&(i<SP)) {\n    cost = pspeedy(points, z, &k, pid, barrier);\n    i++;\n  }\n\n  /* if still not enough facilities, assume z is too high */\n  while (k < kmin) {\n    if (i >= SP) {hiz=z; z=(hiz+loz)/2.0; i=0;}\n    if( pid == 0 ) shuffle(points);\n    cost = pspeedy(points, z, &k, pid, barrier);\n    i++;\n  }\n\n  /* now we begin the binary search for real */\n  /* must designate some points as feasible centers */\n  /* this creates more consistancy between FL runs */\n  /* helps to guarantee correct # of centers at the end */\n  \n  if( pid == 0 )\n    {\n      numfeasible = selectfeasible_fast(points,&feasible,kmin,pid,barrier);\n      for( int i = 0; i< points->num; i++ ) {\n\tis_center[points->p[i].assign]= true;\n      }\n    }\n\n#ifdef ENABLE_THREADS\n  pthread_barrier_wait(barrier);\n#endif\n\n  while(1) {\n    /* first get a rough estimate on the FL solution */\n    lastcost = cost;\n    cost = pFL(points, feasible, numfeasible,\n\t       z, &k, cost, (long)(ITER*kmax*log((double)kmax)), 0.1, pid, barrier);\n\n    /* if number of centers seems good, try a more accurate FL */\n    if (((k <= (1.1)*kmax)&&(k >= (0.9)*kmin))||\n\t((k <= kmax+2)&&(k >= kmin-2))) {\n\n      /* may need to run a little longer here before halting without\n\t improvement */\n      cost = pFL(points, feasible, numfeasible,\n\t\t z, &k, cost, (long)(ITER*kmax*log((double)kmax)), 0.001, pid, barrier);\n    }\n\n    if (k > kmax) {\n      /* facilities too cheap */\n      /* increase facility cost and up the cost accordingly */\n      loz = z; z = (hiz+loz)/2.0;\n      cost += (z-loz)*k;\n    }\n    if (k < kmin) {\n      /* facilities too expensive */\n      /* decrease facility cost and reduce the cost accordingly */\n      hiz = z; z = (hiz+loz)/2.0;\n      cost += (z-hiz)*k;\n    }\n\n    /* if k is good, return the result */\n    /* if we're stuck, just give up and return what we have */\n    if (((k <= kmax)&&(k >= kmin))||((loz >= (0.999)*hiz)) )\n      { \n\tbreak;\n      }\n#ifdef ENABLE_THREADS\n    pthread_barrier_wait(barrier);\n#endif\n  }\n\n  //clean up...\n  if( pid==0 ) {\n    free(feasible); \n    free(hizs);\n    *kfinal = k;\n  }\n\n  return cost;\n}\n\n#endif // TBB_VERSION\n\n\n\n\n/* compute the means for the k clusters */\nint contcenters(Points *points)\n{\n  long i, ii;\n  float relweight;\n\n  for (i=0;i<points->num;i++) {\n    /* compute relative weight of this point to the cluster */\n    if (points->p[i].assign != i) {\n      relweight=points->p[points->p[i].assign].weight + points->p[i].weight;\n      relweight = points->p[i].weight/relweight;\n      for (ii=0;ii<points->dim;ii++) {\n\tpoints->p[points->p[i].assign].coord[ii]*=1.0-relweight;\n\tpoints->p[points->p[i].assign].coord[ii]+=\n\t  points->p[i].coord[ii]*relweight;\n      }\n      points->p[points->p[i].assign].weight += points->p[i].weight;\n    }\n  }\n  \n  return 0;\n}\n\n/* copy centers from points to centers */\nvoid copycenters(Points *points, Points* centers, long* centerIDs, long offset)\n{\n  long i;\n  long k;\n\n  bool *is_a_median = (bool *) calloc(points->num, sizeof(bool));\n\n  /* mark the centers */\n  for ( i = 0; i < points->num; i++ ) {\n    is_a_median[points->p[i].assign] = 1;\n  }\n\n  k=centers->num;\n\n  /* count how many  */\n  for ( i = 0; i < points->num; i++ ) {\n    if ( is_a_median[i] ) {\n      memcpy( centers->p[k].coord, points->p[i].coord, points->dim * sizeof(float));\n      centers->p[k].weight = points->p[i].weight;\n      centerIDs[k] = i + offset;\n      k++;\n    }\n  }\n\n  centers->num = k;\n\n  free(is_a_median);\n}\n\nstruct pkmedian_arg_t\n{\n  Points* points;\n  long kmin;\n  long kmax;\n  long* kfinal;\n  int pid;\n  pthread_barrier_t* barrier;\n};\n\nvoid* localSearchSub(void* arg_) {\n\n  pkmedian_arg_t* arg= (pkmedian_arg_t*)arg_;\n  pkmedian(arg->points,arg->kmin,arg->kmax,arg->kfinal,arg->pid,arg->barrier);\n\n  return NULL;\n}\n\n#ifdef TBB_VERSION\nvoid localSearch( Points* points, long kmin, long kmax, long* kfinal ) {\n  pkmedian_arg_t arg;\n  arg.points = points;\n  arg.kmin = kmin;\n  arg.kmax = kmax;\n  arg.pid = 0;\n  arg.kfinal = kfinal;\n  localSearchSub(&arg);\n}\n#else //!TBB_VERSION\n\nvoid localSearch( Points* points, long kmin, long kmax, long* kfinal ) {\n    pthread_barrier_t barrier;\n    pthread_t* threads = new pthread_t[nproc];\n    pkmedian_arg_t* arg = new pkmedian_arg_t[nproc];\n\n#ifdef ENABLE_THREADS\n    pthread_barrier_init(&barrier,NULL,nproc);\n#endif\n    for( int i = 0; i < nproc; i++ ) {\n      arg[i].points = points;\n      arg[i].kmin = kmin;\n      arg[i].kmax = kmax;\n      arg[i].pid = i;\n      arg[i].kfinal = kfinal;\n\n      arg[i].barrier = &barrier;\n#ifdef ENABLE_THREADS\n      pthread_create(threads+i,NULL,localSearchSub,(void*)&arg[i]);\n#else\n      localSearchSub(&arg[0]);\n#endif\n    }\n\n#ifdef ENABLE_THREADS\n    for ( int i = 0; i < nproc; i++) {\n      pthread_join(threads[i],NULL);\n    }\n#endif\n\n    delete[] threads;\n    delete[] arg;\n#ifdef ENABLE_THREADS\n    pthread_barrier_destroy(&barrier);\n#endif\n}\n#endif // TBB_VERSION\n\n\nclass PStream {\npublic:\n  virtual size_t read( float* dest, int dim, int num ) = 0;\n  virtual int ferror() = 0;\n  virtual int feof() = 0;\n  virtual ~PStream() {\n  }\n};\n\n//synthetic stream\nclass SimStream : public PStream {\npublic:\n  SimStream(long n_ ) {\n    n = n_;\n  }\n  size_t read( float* dest, int dim, int num ) {\n    size_t count = 0;\n    for( int i = 0; i < num && n > 0; i++ ) {\n      for( int k = 0; k < dim; k++ ) {\n\tdest[i*dim + k] = lrand48()/(float)INT_MAX;\n      }\n      n--;\n      count++;\n    }\n    return count;\n  }\n  int ferror() {\n    return 0;\n  }\n  int feof() {\n    return n <= 0;\n  }\n  ~SimStream() { \n  }\nprivate:\n  long n;\n};\n\nclass FileStream : public PStream {\npublic:\n  FileStream(char* filename) {\n    fp = fopen( filename, \"rb\");\n    if( fp == NULL ) {\n      fprintf(stderr,\"error opening file %s\\n.\",filename);\n      exit(1);\n    }\n  }\n  size_t read( float* dest, int dim, int num ) {\n    return std::fread(dest, sizeof(float)*dim, num, fp); \n  }\n  int ferror() {\n    return std::ferror(fp);\n  }\n  int feof() {\n    return std::feof(fp);\n  }\n  ~FileStream() {\n    fprintf(stderr,\"closing file stream\\n\");\n    fclose(fp);\n  }\nprivate:\n  FILE* fp;\n};\n\nvoid outcenterIDs( Points* centers, long* centerIDs, char* outfile ) {\n  FILE* fp = fopen(outfile, \"w\");\n  if( fp==NULL ) {\n    fprintf(stderr, \"error opening %s\\n\",outfile);\n    exit(1);\n  }\n  int* is_a_median = (int*)calloc( sizeof(int), centers->num );\n  for( int i =0 ; i< centers->num; i++ ) {\n    is_a_median[centers->p[i].assign] = 1;\n  }\n\n  for( int i = 0; i < centers->num; i++ ) {\n    if( is_a_median[i] ) {\n      fprintf(fp, \"%u\\n\", centerIDs[i]);\n      fprintf(fp, \"%lf\\n\", centers->p[i].weight);\n      for( int k = 0; k < centers->dim; k++ ) {\n\tfprintf(fp, \"%lf \", centers->p[i].coord[k]);\n      }\n      fprintf(fp,\"\\n\\n\");\n    }\n  }\n  fclose(fp);\n}\n\nvoid streamCluster( PStream* stream, \n\t\t    long kmin, long kmax, int dim,\n\t\t    long chunksize, long centersize, char* outfile )\n{\n\n#ifdef TBB_VERSION\n  float* block = (float*)memoryFloat.allocate( chunksize*dim*sizeof(float) );\n  float* centerBlock = (float*)memoryFloat.allocate(centersize*dim*sizeof(float) );\n  long* centerIDs = (long*)memoryLong.allocate(centersize*dim*sizeof(long));\n#else\n  float* block = (float*)malloc( chunksize*dim*sizeof(float) );\n  float* centerBlock = (float*)malloc(centersize*dim*sizeof(float) );\n  long* centerIDs = (long*)malloc(centersize*dim*sizeof(long));\n#endif\n\n  if( block == NULL ) { \n    fprintf(stderr,\"not enough memory for a chunk!\\n\");\n    exit(1);\n  }\n\n  Points points;\n  points.dim = dim;\n  points.num = chunksize;\n  points.p = \n#ifdef TBB_VERSION\n    (Point *)memoryPoint.allocate(chunksize*sizeof(Point), NULL);\n#else\n    (Point *)malloc(chunksize*sizeof(Point));\n#endif\n\n  for( int i = 0; i < chunksize; i++ ) {\n    points.p[i].coord = &block[i*dim];\n  }\n\n  Points centers;\n  centers.dim = dim;\n  centers.p = \n#ifdef TBB_VERSION\n    (Point *)memoryPoint.allocate(centersize*sizeof(Point), NULL);\n#else\n    (Point *)malloc(centersize*sizeof(Point));\n#endif\n  centers.num = 0;\n\n  for( int i = 0; i< centersize; i++ ) {\n    centers.p[i].coord = &centerBlock[i*dim];\n    centers.p[i].weight = 1.0;\n  }\n\n  long IDoffset = 0;\n  long kfinal;\n  while(1) {\n\n    size_t numRead  = stream->read(block, dim, chunksize ); \n    fprintf(stderr,\"read %d points\\n\",numRead);\n\n    if( stream->ferror() || numRead < (unsigned int)chunksize && !stream->feof() ) {\n      fprintf(stderr, \"error reading data!\\n\");\n      exit(1);\n    }\n\n    points.num = numRead;\n    for( int i = 0; i < points.num; i++ ) {\n      points.p[i].weight = 1.0;\n    }\n\n#ifdef TBB_VERSION\n    switch_membership = (bool*)memoryBool.allocate(points.num*sizeof(bool), NULL);\n    is_center = (bool*)calloc(points.num,sizeof(bool));\n    center_table = (int*)memoryInt.allocate(points.num*sizeof(int));\n#else\n    switch_membership = (bool*)malloc(points.num*sizeof(bool));\n    is_center = (bool*)calloc(points.num,sizeof(bool));\n    center_table = (int*)malloc(points.num*sizeof(int));\n#endif\n\n\n    //fprintf(stderr,\"center_table = 0x%08x\\n\",(int)center_table);\n    //fprintf(stderr,\"is_center = 0x%08x\\n\",(int)is_center);\n\n    localSearch(&points,kmin, kmax,&kfinal); // parallel\n\n    //fprintf(stderr,\"finish local search\\n\");\n    contcenters(&points); /* sequential */\n    if( kfinal + centers.num > centersize ) {\n      //here we don't handle the situation where # of centers gets too large. \n      fprintf(stderr,\"oops! no more space for centers\\n\");\n      exit(1);\n    }\n\n    copycenters(&points, &centers, centerIDs, IDoffset); /* sequential */\n    IDoffset += numRead;\n\n#ifdef TBB_VERSION\n    memoryBool.deallocate(switch_membership, sizeof(bool));\n    free(is_center);\n    memoryInt.deallocate(center_table, sizeof(int));\n#else\n    free(is_center);\n    free(switch_membership);\n    free(center_table);\n#endif\n\n    if( stream->feof() ) {\n      break;\n    }\n  }\n\n  //finally cluster all temp centers\n#ifdef TBB_VERSION\n  switch_membership = (bool*)memoryBool.allocate(centers.num*sizeof(bool));\n  is_center = (bool*)calloc(centers.num,sizeof(bool));\n  center_table = (int*)memoryInt.allocate(centers.num*sizeof(int));\n#else\n  switch_membership = (bool*)malloc(centers.num*sizeof(bool));\n  is_center = (bool*)calloc(centers.num,sizeof(bool));\n  center_table = (int*)malloc(centers.num*sizeof(int));\n#endif\n\n  localSearch( &centers, kmin, kmax ,&kfinal ); // parallel\n  contcenters(&centers);\n  outcenterIDs( &centers, centerIDs, outfile);\n}\n\nint main(int argc, char **argv)\n{\n  char *outfilename = new char[MAXNAMESIZE];\n  char *infilename = new char[MAXNAMESIZE];\n  long kmin, kmax, n, chunksize, clustersize;\n  int dim;\n\n#ifdef PARSEC_VERSION\n#define __PARSEC_STRING(x) #x\n#define __PARSEC_XSTRING(x) __PARSEC_STRING(x)\n        fprintf(stderr,\"PARSEC Benchmark Suite Version \"__PARSEC_XSTRING(PARSEC_VERSION)\"\\n\");\n\tfflush(NULL);\n#else\n        fprintf(stderr,\"PARSEC Benchmark Suite\\n\");\n\tfflush(NULL);\n#endif //PARSEC_VERSION\n#ifdef ENABLE_PARSEC_HOOKS\n  __parsec_bench_begin(__parsec_streamcluster);\n#endif\n\n  if (argc<10) {\n    fprintf(stderr,\"usage: %s k1 k2 d n chunksize clustersize infile outfile nproc\\n\",\n\t    argv[0]);\n    fprintf(stderr,\"  k1:          Min. number of centers allowed\\n\");\n    fprintf(stderr,\"  k2:          Max. number of centers allowed\\n\");\n    fprintf(stderr,\"  d:           Dimension of each data point\\n\");\n    fprintf(stderr,\"  n:           Number of data points\\n\");\n    fprintf(stderr,\"  chunksize:   Number of data points to handle per step\\n\");\n    fprintf(stderr,\"  clustersize: Maximum number of intermediate centers\\n\");\n    fprintf(stderr,\"  infile:      Input file (if n<=0)\\n\");\n    fprintf(stderr,\"  outfile:     Output file\\n\");\n    fprintf(stderr,\"  nproc:       Number of threads to use\\n\");\n    fprintf(stderr,\"\\n\");\n    fprintf(stderr, \"if n > 0, points will be randomly generated instead of reading from infile.\\n\");\n    exit(1);\n  }\n\n\n\n  kmin = atoi(argv[1]);\n  kmax = atoi(argv[2]);\n  dim = atoi(argv[3]);\n  n = atoi(argv[4]);\n  chunksize = atoi(argv[5]);\n  clustersize = atoi(argv[6]);\n  strcpy(infilename, argv[7]);\n  strcpy(outfilename, argv[8]);\n  nproc = atoi(argv[9]);\n\n\n#ifdef TBB_VERSION\n  fprintf(stderr,\"TBB version. Number of divisions: %d\\n\",NUM_DIVISIONS);\n  tbb::task_scheduler_init init(nproc);\n#endif\n\n\n  srand48(SEED);\n  PStream* stream;\n  if( n > 0 ) {\n    stream = new SimStream(n);\n  }\n  else {\n    stream = new FileStream(infilename);\n  }\n\n\n#ifdef ENABLE_PARSEC_HOOKS\n  __parsec_roi_begin();\n#endif\n\n  streamCluster(stream, kmin, kmax, dim, chunksize, clustersize, outfilename );\n\n#ifdef ENABLE_PARSEC_HOOKS\n  __parsec_roi_end();\n#endif\n\n  delete stream;\n\n#ifdef ENABLE_PARSEC_HOOKS\n  __parsec_bench_end();\n#endif\n  \n  return 0;\n}", "label": 2}
{"code": "// Copyright (c) 2007 Intel Corp.\n\n// Black-Scholes\n// Analytical method for calculating European Options\n//\n// \n// Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice \n// Hall, John C. Hull,\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <math.h>\n#include <string.h>\n\n#ifdef ENABLE_PARSEC_HOOKS\n#include <hooks.h>\n#endif\n\n// Multi-threaded pthreads header\n#ifdef ENABLE_THREADS\n// Add the following line so that icc 9.0 is compatible with pthread lib.\n#define __thread __threadp\nMAIN_ENV\n#undef __thread\n#endif\n\n// Multi-threaded OpenMP header\n#ifdef ENABLE_OPENMP\n#include <omp.h>\n#endif\n\n#ifdef ENABLE_TBB\n#include \"tbb/blocked_range.h\"\n#include \"tbb/parallel_for.h\"\n#include \"tbb/task_scheduler_init.h\"\n#include \"tbb/tick_count.h\"\n\nusing namespace std;\nusing namespace tbb;\n#endif //ENABLE_TBB\n\n// Multi-threaded header for Windows\n#ifdef WIN32\n#pragma warning(disable : 4305)\n#pragma warning(disable : 4244)\n#include <windows.h>\n#endif\n\n//Precision to use for calculations\n#define fptype float\n\n#define NUM_RUNS 100\n\ntypedef struct OptionData_ {\n        fptype s;          // spot price\n        fptype strike;     // strike price\n        fptype r;          // risk-free interest rate\n        fptype divq;       // dividend rate\n        fptype v;          // volatility\n        fptype t;          // time to maturity or option expiration in years \n                           //     (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)  \n        char OptionType;   // Option type.  \"P\"=PUT, \"C\"=CALL\n        fptype divs;       // dividend vals (not used in this test)\n        fptype DGrefval;   // DerivaGem Reference Value\n} OptionData;\n\nOptionData *data;\nfptype *prices;\nint numOptions;\n\nint    * otype;\nfptype * sptprice;\nfptype * strike;\nfptype * rate;\nfptype * volatility;\nfptype * otime;\nint numError = 0;\nint nThreads;\n\n////////////////////////////////////////////////////////////////////////////////\n////////////////////////////////////////////////////////////////////////////////\n///////////////////////////////////////////////////////////////////////////////\n////////////////////////////////////////////////////////////////////////////////\n// Cumulative Normal Distribution Function\n// See Hull, Section 11.8, P.243-244\n#define inv_sqrt_2xPI 0.39894228040143270286\n\nfptype CNDF ( fptype InputX ) \n{\n    int sign;\n\n    fptype OutputX;\n    fptype xInput;\n    fptype xNPrimeofX;\n    fptype expValues;\n    fptype xK2;\n    fptype xK2_2, xK2_3;\n    fptype xK2_4, xK2_5;\n    fptype xLocal, xLocal_1;\n    fptype xLocal_2, xLocal_3;\n\n    // Check for negative value of InputX\n    if (InputX < 0.0) {\n        InputX = -InputX;\n        sign = 1;\n    } else \n        sign = 0;\n\n    xInput = InputX;\n \n    // Compute NPrimeX term common to both four & six decimal accuracy calcs\n    expValues = exp(-0.5f * InputX * InputX);\n    xNPrimeofX = expValues;\n    xNPrimeofX = xNPrimeofX * inv_sqrt_2xPI;\n\n    xK2 = 0.2316419 * xInput;\n    xK2 = 1.0 + xK2;\n    xK2 = 1.0 / xK2;\n    xK2_2 = xK2 * xK2;\n    xK2_3 = xK2_2 * xK2;\n    xK2_4 = xK2_3 * xK2;\n    xK2_5 = xK2_4 * xK2;\n    \n    xLocal_1 = xK2 * 0.319381530;\n    xLocal_2 = xK2_2 * (-0.356563782);\n    xLocal_3 = xK2_3 * 1.781477937;\n    xLocal_2 = xLocal_2 + xLocal_3;\n    xLocal_3 = xK2_4 * (-1.821255978);\n    xLocal_2 = xLocal_2 + xLocal_3;\n    xLocal_3 = xK2_5 * 1.330274429;\n    xLocal_2 = xLocal_2 + xLocal_3;\n\n    xLocal_1 = xLocal_2 + xLocal_1;\n    xLocal   = xLocal_1 * xNPrimeofX;\n    xLocal   = 1.0 - xLocal;\n\n    OutputX  = xLocal;\n    \n    if (sign) {\n        OutputX = 1.0 - OutputX;\n    }\n    \n    return OutputX;\n} \n\n//////////////////////////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////////////\nfptype BlkSchlsEqEuroNoDiv( fptype sptprice,\n                            fptype strike, fptype rate, fptype volatility,\n                            fptype time, int otype, float timet )\n{\n    fptype OptionPrice;\n\n    // local private working variables for the calculation\n    fptype xStockPrice;\n    fptype xStrikePrice;\n    fptype xRiskFreeRate;\n    fptype xVolatility;\n    fptype xTime;\n    fptype xSqrtTime;\n\n    fptype logValues;\n    fptype xLogTerm;\n    fptype xD1; \n    fptype xD2;\n    fptype xPowerTerm;\n    fptype xDen;\n    fptype d1;\n    fptype d2;\n    fptype FutureValueX;\n    fptype NofXd1;\n    fptype NofXd2;\n    fptype NegNofXd1;\n    fptype NegNofXd2;    \n    \n    xStockPrice = sptprice;\n    xStrikePrice = strike;\n    xRiskFreeRate = rate;\n    xVolatility = volatility;\n\n    xTime = time;\n    xSqrtTime = sqrt(xTime);\n\n    logValues = log( sptprice / strike );\n        \n    xLogTerm = logValues;\n        \n    \n    xPowerTerm = xVolatility * xVolatility;\n    xPowerTerm = xPowerTerm * 0.5;\n        \n    xD1 = xRiskFreeRate + xPowerTerm;\n    xD1 = xD1 * xTime;\n    xD1 = xD1 + xLogTerm;\n\n    xDen = xVolatility * xSqrtTime;\n    xD1 = xD1 / xDen;\n    xD2 = xD1 -  xDen;\n\n    d1 = xD1;\n    d2 = xD2;\n    \n    NofXd1 = CNDF( d1 );\n    NofXd2 = CNDF( d2 );\n\n    FutureValueX = strike * ( exp( -(rate)*(time) ) );        \n    if (otype == 0) {            \n        OptionPrice = (sptprice * NofXd1) - (FutureValueX * NofXd2);\n    } else { \n        NegNofXd1 = (1.0 - NofXd1);\n        NegNofXd2 = (1.0 - NofXd2);\n        OptionPrice = (FutureValueX * NegNofXd2) - (sptprice * NegNofXd1);\n    }\n    \n    return OptionPrice;\n}\n\n#ifdef ENABLE_TBB\nstruct mainWork {\n  mainWork() {}\n  mainWork(mainWork &w, tbb::split) {}\n\n  void operator()(const tbb::blocked_range<int> &range) const {\n    fptype price;\n    int begin = range.begin();\n    int end = range.end();\n\n    for (int i=begin; i!=end; i++) {\n      /* Calling main function to calculate option value based on \n       * Black & Scholes's equation.\n       */\n\n      price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i],\n                                   rate[i], volatility[i], otime[i], \n                                   otype[i], 0);\n      prices[i] = price;\n\n#ifdef ERR_CHK \n      fptype priceDelta = data[i].DGrefval - price;\n      if( fabs(priceDelta) >= 1e-5 ){\n        fprintf(stderr,\"Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\\n\",\n               i, price, data[i].DGrefval, priceDelta);\n        numError ++;\n      }\n#endif\n    }\n  }\n};\n\n#endif // ENABLE_TBB\n\n//////////////////////////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////////////\n//////////////////////////////////////////////////////////////////////////////////////\n\n#ifdef ENABLE_TBB\nint bs_thread(void *tid_ptr) {\n    int j;\n    tbb::affinity_partitioner a;\n\n    mainWork doall;\n    for (j=0; j<NUM_RUNS; j++) {\n      tbb::parallel_for(tbb::blocked_range<int>(0, numOptions), doall, a);\n    }\n\n    return 0;\n}\n#else // !ENABLE_TBB\n\n#ifdef WIN32\nDWORD WINAPI bs_thread(LPVOID tid_ptr){\n#else\nint bs_thread(void *tid_ptr) {\n#endif\n    int i, j;\n    fptype price;\n    fptype priceDelta;\n    int tid = *(int *)tid_ptr;\n    int start = tid * (numOptions / nThreads);\n    int end = start + (numOptions / nThreads);\n\n    for (j=0; j<NUM_RUNS; j++) {\n#ifdef ENABLE_OPENMP\n#pragma omp parallel for private(i, price, priceDelta)\n        for (i=0; i<numOptions; i++) {\n#else  //ENABLE_OPENMP\n        for (i=start; i<end; i++) {\n#endif //ENABLE_OPENMP\n            /* Calling main function to calculate option value based on \n             * Black & Scholes's equation.\n             */\n            price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i],\n                                         rate[i], volatility[i], otime[i], \n                                         otype[i], 0);\n            prices[i] = price;\n\n#ifdef ERR_CHK\n            priceDelta = data[i].DGrefval - price;\n            if( fabs(priceDelta) >= 1e-4 ){\n                printf(\"Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\\n\",\n                       i, price, data[i].DGrefval, priceDelta);\n                numError ++;\n            }\n#endif\n        }\n    }\n\n    return 0;\n}\n#endif //ENABLE_TBB\n\nint main (int argc, char **argv)\n{\n    FILE *file;\n    int i;\n    int loopnum;\n    fptype * buffer;\n    int * buffer2;\n    int rv;\n\n#ifdef PARSEC_VERSION\n#define __PARSEC_STRING(x) #x\n#define __PARSEC_XSTRING(x) __PARSEC_STRING(x)\n        printf(\"PARSEC Benchmark Suite Version \"__PARSEC_XSTRING(PARSEC_VERSION)\"\\n\");\n\tfflush(NULL);\n#else\n        printf(\"PARSEC Benchmark Suite\\n\");\n\tfflush(NULL);\n#endif //PARSEC_VERSION\n#ifdef ENABLE_PARSEC_HOOKS\n   __parsec_bench_begin(__parsec_blackscholes);\n#endif\n\n   if (argc != 4)\n        {\n                printf(\"Usage:\\n\\t%s <nthreads> <inputFile> <outputFile>\\n\", argv[0]);\n                exit(1);\n        }\n    nThreads = atoi(argv[1]);\n    char *inputFile = argv[2];\n    char *outputFile = argv[3];\n\n    //Read input data from file\n    file = fopen(inputFile, \"r\");\n    if(file == NULL) {\n      printf(\"ERROR: Unable to open file `%s'.\\n\", inputFile);\n      exit(1);\n    }\n    rv = fscanf(file, \"%i\", &numOptions);\n    if(rv != 1) {\n      printf(\"ERROR: Unable to read from file `%s'.\\n\", inputFile);\n      fclose(file);\n      exit(1);\n    }\n    if(nThreads > numOptions) {\n      printf(\"WARNING: Not enough work, reducing number of threads to match number of options.\\n\");\n      nThreads = numOptions;\n    }\n\n#if !defined(ENABLE_THREADS) && !defined(ENABLE_OPENMP) && !defined(ENABLE_TBB)\n    if(nThreads != 1) {\n        printf(\"Error: <nthreads> must be 1 (serial version)\\n\");\n        exit(1);\n    }\n#endif\n\n    // alloc spaces for the option data\n    data = (OptionData*)malloc(numOptions*sizeof(OptionData));\n    prices = (fptype*)malloc(numOptions*sizeof(fptype));\n    for ( loopnum = 0; loopnum < numOptions; ++ loopnum )\n    {\n        rv = fscanf(file, \"%f %f %f %f %f %f %c %f %f\", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval);\n        if(rv != 9) {\n          printf(\"ERROR: Unable to read from file `%s'.\\n\", inputFile);\n          fclose(file);\n          exit(1);\n        }\n    }\n    rv = fclose(file);\n    if(rv != 0) {\n      printf(\"ERROR: Unable to close file `%s'.\\n\", inputFile);\n      exit(1);\n    }\n\n#ifdef ENABLE_THREADS\n    MAIN_INITENV(,8000000,nThreads);\n#endif\n    printf(\"Num of Options: %d\\n\", numOptions);\n    printf(\"Num of Runs: %d\\n\", NUM_RUNS);\n\n#define PAD 256\n#define LINESIZE 64\n\n    buffer = (fptype *) malloc(5 * numOptions * sizeof(fptype) + PAD);\n    sptprice = (fptype *) (((unsigned long long)buffer + PAD) & ~(LINESIZE - 1));\n    strike = sptprice + numOptions;\n    rate = strike + numOptions;\n    volatility = rate + numOptions;\n    otime = volatility + numOptions;\n\n    buffer2 = (int *) malloc(numOptions * sizeof(fptype) + PAD);\n    otype = (int *) (((unsigned long long)buffer2 + PAD) & ~(LINESIZE - 1));\n\n    for (i=0; i<numOptions; i++) {\n        otype[i]      = (data[i].OptionType == 'P') ? 1 : 0;\n        sptprice[i]   = data[i].s;\n        strike[i]     = data[i].strike;\n        rate[i]       = data[i].r;\n        volatility[i] = data[i].v;    \n        otime[i]      = data[i].t;\n    }\n\n    printf(\"Size of data: %d\\n\", numOptions * (sizeof(OptionData) + sizeof(int)));\n\n#ifdef ENABLE_PARSEC_HOOKS\n    __parsec_roi_begin();\n#endif\n\n#ifdef ENABLE_THREADS\n#ifdef WIN32\n    HANDLE *threads;\n    int *nums;\n    threads = (HANDLE *) malloc (nThreads * sizeof(HANDLE));\n    nums = (int *) malloc (nThreads * sizeof(int));\n\n    for(i=0; i<nThreads; i++) {\n        nums[i] = i;\n        threads[i] = CreateThread(0, 0, bs_thread, &nums[i], 0, 0);\n    }\n    WaitForMultipleObjects(nThreads, threads, TRUE, INFINITE);\n    free(threads);\n    free(nums);\n#else\n    int *tids;\n    tids = (int *) malloc (nThreads * sizeof(int));\n\n    for(i=0; i<nThreads; i++) {\n        tids[i]=i;\n        CREATE_WITH_ARG(bs_thread, &tids[i]);\n    }\n    WAIT_FOR_END(nThreads);\n    free(tids);\n#endif //WIN32\n#else //ENABLE_THREADS\n#ifdef ENABLE_OPENMP\n    {\n        int tid=0;\n        omp_set_num_threads(nThreads);\n        bs_thread(&tid);\n    }\n#else //ENABLE_OPENMP\n#ifdef ENABLE_TBB\n    tbb::task_scheduler_init init(nThreads);\n\n    int tid=0;\n    bs_thread(&tid);\n#else //ENABLE_TBB\n    //serial version\n    int tid=0;\n    bs_thread(&tid);\n#endif //ENABLE_TBB\n#endif //ENABLE_OPENMP\n#endif //ENABLE_THREADS\n\n#ifdef ENABLE_PARSEC_HOOKS\n    __parsec_roi_end();\n#endif\n\n    //Write prices to output file\n    file = fopen(outputFile, \"w\");\n    if(file == NULL) {\n      printf(\"ERROR: Unable to open file `%s'.\\n\", outputFile);\n      exit(1);\n    }\n    rv = fprintf(file, \"%i\\n\", numOptions);\n    if(rv < 0) {\n      printf(\"ERROR: Unable to write to file `%s'.\\n\", outputFile);\n      fclose(file);\n      exit(1);\n    }\n    for(i=0; i<numOptions; i++) {\n      rv = fprintf(file, \"%.18f\\n\", prices[i]);\n      if(rv < 0) {\n        printf(\"ERROR: Unable to write to file `%s'.\\n\", outputFile);\n        fclose(file);\n        exit(1);\n      }\n    }\n    rv = fclose(file);\n    if(rv != 0) {\n      printf(\"ERROR: Unable to close file `%s'.\\n\", outputFile);\n      exit(1);\n    }\n\n#ifdef ERR_CHK\n    printf(\"Num Errors: %d\\n\", numError);\n#endif\n    free(data);\n    free(prices);\n\n#ifdef ENABLE_PARSEC_HOOKS\n    __parsec_bench_end();\n#endif\n\n    return 0;\n}\n", "label": 2}
{"code": "/*----------------------------------------------------------------------\n  PuReMD - Purdue ReaxFF Molecular Dynamics Program\n\n  Copyright (2010) Purdue University\n  Hasan Metin Aktulga, haktulga@cs.purdue.edu\n  Joseph Fogarty, jcfogart@mail.usf.edu\n  Sagar Pandit, pandit@usf.edu\n  Ananth Y Grama, ayg@cs.purdue.edu\n\n  This program is free software; you can redistribute it and/or\n  modify it under the terms of the GNU General Public License as\n  published by the Free Software Foundation; either version 2 of\n  the License, or (at your option) any later version.\n\n  This program is distributed in the hope that it will be useful,\n  but WITHOUT ANY WARRANTY; without even the implied warranty of\n  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n  See the GNU General Public License for more details:\n  <http://www.gnu.org/licenses/>.\n  ----------------------------------------------------------------------*/\n\n#include \"puremd.h\"\n\n#include \"allocate.h\"\n#include \"analyze.h\"\n#include \"comm_tools.h\"\n#include \"control.h\"\n#include \"ffield.h\"\n#include \"forces.h\"\n#include \"geo_tools.h\"\n#include \"init_md.h\"\n#include \"integrate.h\"\n#include \"io_tools.h\"\n#include \"neighbors.h\"\n#include \"reset_tools.h\"\n#include \"restart.h\"\n#include \"system_props.h\"\n#include \"tool_box.h\"\n#include \"traj.h\"\n#include \"vector.h\"\n\n#if defined(HAVE_CUDA)\n  #include \"cuda/cuda_copy.h\"\n  #include \"cuda/cuda_environment.h\"\n  #include \"cuda/cuda_forces.h\"\n  #include \"cuda/cuda_init_md.h\"\n  #include \"cuda/cuda_neighbors.h\"\n  #include \"cuda/cuda_post_evolve.h\"\n  #include \"cuda/cuda_reset_tools.h\"\n  #include \"cuda/cuda_system_props.h\"\n#elif defined(HAVE_HIP)\n  #include \"hip/hip_copy.h\"\n  #include \"hip/hip_environment.h\"\n  #include \"hip/hip_forces.h\"\n  #include \"hip/hip_init_md.h\"\n  #include \"hip/hip_neighbors.h\"\n  #include \"hip/hip_post_evolve.h\"\n  #include \"hip/hip_reset_tools.h\"\n  #include \"hip/hip_system_props.h\"\n#endif\n\n\nstatic void Read_Config_Files( const char * const geo_file,\n        const char * const ffield_file,\n        const char * const control_file,\n        reax_system * const system, control_params * const control,\n        simulation_data * const data, storage * const workspace,\n        output_controls * const out_control, mpi_datatypes * const mpi_data )\n{\n    Read_Force_Field_File( ffield_file, &system->reax_param, system, control );\n\n    Read_Control_File( control_file, control, out_control );\n\n    if ( control->geo_format == CUSTOM )\n    {\n        Read_Geo_File( geo_file, system, control, data, workspace, mpi_data );\n    }\n    else if ( control->geo_format == PDB )\n    {\n        Read_PDB_File( geo_file, system, control, data, workspace, mpi_data );\n    }\n    else if ( control->geo_format == BGF )\n    {\n        Read_BGF( geo_file, system, control, data, workspace, mpi_data );\n    }\n    else if ( control->geo_format == ASCII_RESTART )\n    {\n        Read_Restart_File( geo_file, system, control, data, workspace, mpi_data );\n        control->restart = 1;\n    }\n    else if ( control->geo_format == BINARY_RESTART )\n    {\n        Read_Binary_Restart_File( geo_file, system, control, data, workspace, mpi_data );\n        control->restart = 1;\n    }\n    else\n    {\n        fprintf( stderr, \"[ERROR] unknown geo file format. terminating!\\n\" );\n        MPI_Abort( MPI_COMM_WORLD, INVALID_GEO );\n    }\n}\n\n\n#if defined(HAVE_CUDA)\nstatic void Cuda_Post_Evolve( reax_system * const system, control_params * const control,\n        simulation_data * const data, storage * const workspace, reax_list ** const lists,\n        output_controls * const out_control, mpi_datatypes * const mpi_data )\n{\n    /* remove translational and rotational velocity of the center of mass from system */\n    if ( control->ensemble != NVE && control->remove_CoM_vel > 0\n            && data->step % control->remove_CoM_vel == 0 )\n    {\n        /* compute velocity of the center of mass */\n        Cuda_Compute_Center_of_Mass( system, control, workspace,\n                data, mpi_data, mpi_data->comm_mesh3D );\n\n        Cuda_Remove_CoM_Velocities( system, control, data );\n    }\n\n    if ( control->ensemble == NVE )\n    {\n        /* compute kinetic energy of the system */\n        Cuda_Compute_Kinetic_Energy( system, control, workspace,\n                data, mpi_data->comm_mesh3D );\n    }\n\n    if ( (out_control->energy_update_freq > 0\n                && (data->step - data->prev_steps) % out_control->energy_update_freq == 0)\n            || (out_control->write_steps > 0\n                && data->step % out_control->write_steps == 0) )\n    {\n        Compute_Total_Energy( system, control, data, MPI_COMM_WORLD );\n    }\n}\n\n\n#elif defined(HAVE_HIP)\nstatic void Hip_Post_Evolve( reax_system * const system, control_params * const control,\n        simulation_data * const data, storage * const workspace, reax_list ** const lists,\n        output_controls * const out_control, mpi_datatypes * const mpi_data )\n{\n    /* remove translational and rotational velocity of the center of mass from system */\n    if ( control->ensemble != NVE && control->remove_CoM_vel > 0\n            && data->step % control->remove_CoM_vel == 0 )\n    {\n        /* compute velocity of the center of mass */\n        Hip_Compute_Center_of_Mass( system, control, workspace,\n                data, mpi_data, mpi_data->comm_mesh3D );\n\n        Hip_Remove_CoM_Velocities( system, control, data );\n    }\n\n    if ( control->ensemble == NVE )\n    {\n        /* compute kinetic energy of the system */\n        Hip_Compute_Kinetic_Energy( system, control, workspace,\n                data, mpi_data->comm_mesh3D );\n    }\n\n    if ( (out_control->energy_update_freq > 0\n                && (data->step - data->prev_steps) % out_control->energy_update_freq == 0)\n            || (out_control->write_steps > 0\n                && data->step % out_control->write_steps == 0) )\n    {\n        Compute_Total_Energy( system, control, data, MPI_COMM_WORLD );\n    }\n}\n\n\n#else\nstatic void Post_Evolve( reax_system * const system, control_params * const control,\n        simulation_data * const data, storage * const workspace, reax_list ** const lists,\n        output_controls * const out_control, mpi_datatypes * const mpi_data )\n{\n    int i;\n    rvec diff, cross;\n\n    /* remove translational and rotational velocity of the center of mass from system */\n    if ( control->ensemble != NVE && control->remove_CoM_vel > 0\n            && data->step % control->remove_CoM_vel == 0 )\n    {\n        /* compute velocity of the center of mass */\n        Compute_Center_of_Mass( system, data, mpi_data, mpi_data->comm_mesh3D );\n\n        for ( i = 0; i < system->n; i++ )\n        {\n            /* remove translational term */\n            rvec_ScaledAdd( system->my_atoms[i].v, -1.0, data->vcm );\n\n            /* remove rotational term */\n            rvec_ScaledSum( diff, 1.0, system->my_atoms[i].x, -1.0, data->xcm );\n            rvec_Cross( cross, data->avcm, diff );\n            rvec_ScaledAdd( system->my_atoms[i].v, -1.0, cross );\n        }\n    }\n\n    if ( control->ensemble == NVE )\n    {\n        /* compute kinetic energy of system */\n        Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );\n    }\n\n    if ( (out_control->energy_update_freq > 0\n                && (data->step - data->prev_steps) % out_control->energy_update_freq == 0)\n            || (out_control->write_steps > 0\n                && data->step % out_control->write_steps == 0) )\n    {\n        Compute_Total_Energy( system, control, data, MPI_COMM_WORLD );\n    }\n}\n#endif\n\n\nvoid* setup( const char * const geo_file, const char * const ffield_file,\n        const char * const control_file )\n{\n    int i;\n    puremd_handle *pmd_handle;\n\n    /* top-level allocation */\n    pmd_handle = (puremd_handle*) smalloc( sizeof(puremd_handle),\n            __FILE__, __LINE__ );\n\n    /* second-level allocations */\n    pmd_handle->system = smalloc( sizeof(reax_system), __FILE__, __LINE__ );\n    pmd_handle->control = smalloc( sizeof(control_params), __FILE__, __LINE__ );\n    pmd_handle->data = smalloc( sizeof(simulation_data), __FILE__, __LINE__ );\n    pmd_handle->workspace = smalloc( sizeof(storage), __FILE__, __LINE__ );\n#if defined(HAVE_CUDA) || defined(HAVE_HIP)\n    pmd_handle->workspace->d_workspace = smalloc( sizeof(storage), __FILE__, __LINE__ );\n#endif\n    pmd_handle->lists = smalloc( sizeof(reax_list *) * LIST_N, __FILE__, __LINE__ );\n    for ( i = 0; i < LIST_N; ++i )\n    {\n        pmd_handle->lists[i] = smalloc( sizeof(reax_list), __FILE__, __LINE__ );\n        pmd_handle->lists[i]->allocated = FALSE;\n    }\n    pmd_handle->out_control = smalloc( sizeof(output_controls), __FILE__, __LINE__ );\n    pmd_handle->mpi_data = smalloc( sizeof(mpi_datatypes), __FILE__, __LINE__ );\n\n    pmd_handle->output_enabled = TRUE;\n    pmd_handle->callback = NULL;\n\n    /* setup MPI environment */\n    MPI_Comm_size( MPI_COMM_WORLD, &pmd_handle->control->nprocs );\n    MPI_Comm_rank( MPI_COMM_WORLD, &pmd_handle->system->my_rank );\n\n#if defined(DEBUG)\n    fprintf( stderr, \"[INFO] MPI timer resolution: %f\\n\", MPI_Wtick( ) );\n#endif\n\n    /* initialize logging timing and\n     * globally synchronize clocks across all MPI processes */\n    MPI_Barrier( MPI_COMM_WORLD );\n    pmd_handle->data->timing.start = Get_Time( );\n\n    /* read system config files */\n    Read_Config_Files( geo_file, ffield_file, control_file,\n            pmd_handle->system, pmd_handle->control, pmd_handle->data,\n            pmd_handle->workspace, pmd_handle->out_control, pmd_handle->mpi_data );\n\n#if defined(HAVE_CUDA)\n    Cuda_Setup_Environment( pmd_handle->system, pmd_handle->control );\n#elif defined(HAVE_HIP)\n    Hip_Setup_Environment( pmd_handle->system, pmd_handle->control );\n#endif\n\n    return (void*) pmd_handle;\n}\n\n\nint setup_callback( const void * const handle, const callback_function callback  )\n{\n    int ret;\n    puremd_handle *pmd_handle;\n\n\n    ret = PUREMD_FAILURE;\n\n    if ( handle != NULL && callback != NULL )\n    {\n        pmd_handle = (puremd_handle*) handle;\n        pmd_handle->callback = callback;\n        ret = PUREMD_SUCCESS;\n    }\n\n    return ret;\n}\n\n\nint simulate( const void * const handle )\n{\n    int ret, ret_pmd, retries;\n    reax_system *system;\n    control_params *control;\n    simulation_data *data;\n    storage *workspace;\n    reax_list **lists;\n    output_controls *out_control;\n    mpi_datatypes *mpi_data;\n    puremd_handle *pmd_handle;\n\n    ret_pmd = PUREMD_FAILURE;\n\n    if ( handle != NULL )\n    {\n        pmd_handle = (puremd_handle*) handle;\n\n        system = pmd_handle->system;\n        control = pmd_handle->control;\n        data = pmd_handle->data;\n        workspace = pmd_handle->workspace;\n        lists = pmd_handle->lists;\n        out_control = pmd_handle->out_control;\n        mpi_data = pmd_handle->mpi_data;\n\n#if defined(HAVE_CUDA)\n        Cuda_Initialize( system, control, data, workspace, lists, out_control, mpi_data );\n\n        /* compute f_0 */\n        Comm_Atoms( system, control, data, workspace, mpi_data, TRUE );\n\n#if defined(GPU_DEVICE_PACK)\n        //TODO: remove once Comm_Atoms ported\n        Cuda_Copy_MPI_Data_Host_to_Device( control, mpi_data );\n#endif\n\n        Cuda_Init_Block_Sizes( system, control );\n\n        Cuda_Copy_Atoms_Host_to_Device( system, control );\n        Cuda_Copy_Grid_Host_to_Device( control, &system->my_grid, &system->d_my_grid );\n\n        Cuda_Reset( system, control, data, workspace, lists );\n\n        Cuda_Generate_Neighbor_Lists( system, control, data, workspace, lists );\n\n        Cuda_Compute_Forces( system, control, data, workspace, lists,\n                out_control, mpi_data );\n\n        Cuda_Compute_Kinetic_Energy( system, control, workspace,\n                data, mpi_data->comm_mesh3D );\n\n        Compute_Total_Energy( system, control, data, MPI_COMM_WORLD );\n\n        Output_Results( system, control, data, lists, out_control, mpi_data );\n\n        Check_Energy( data );\n\n#if defined(DEBUG_FOCUS)\n        Cuda_Print_Mem_Usage( data );\n#endif\n\n        ++data->step;\n        retries = 0;\n        while ( data->step <= control->nsteps && retries < MAX_RETRIES )\n        {\n            ret = SUCCESS;\n\n            if ( control->T_mode > 0 && retries == 0 )\n            {\n                Temperature_Control( control, data );\n            }\n    \n            ret = control->Cuda_Evolve( system, control, data, workspace,\n                    lists, out_control, mpi_data );\n\n            if ( ret == SUCCESS )\n            {\n                Cuda_Post_Evolve( system, control, data, workspace, lists,\n                        out_control, mpi_data );\n            }\n\n            if ( ret == SUCCESS )\n            {\n                data->timing.num_retries = retries;\n\n                Output_Results( system, control, data, lists, out_control, mpi_data );\n\n//              Analysis( system, control, data, workspace, lists, out_control, mpi_data );\n\n                if ( out_control->restart_freq\n                        && (data->step - data->prev_steps) % out_control->restart_freq == 0 )\n                {\n                    if ( out_control->restart_format == WRITE_ASCII )\n                    {\n                        Write_Restart_File( system, control, data, out_control, mpi_data );\n                    }\n                    else if ( out_control->restart_format == WRITE_BINARY )\n                    {\n                        Write_Binary_Restart_File( system, control, data, out_control, mpi_data );\n                    }\n                }\n\n                Check_Energy( data );\n\n                ++data->step;\n                retries = 0;\n            }\n            else\n            {\n                ++retries;\n\n#if defined(DEBUG_FOCUS)\n                fprintf( stderr, \"[INFO] p%d: retrying step %d...\\n\", system->my_rank, data->step );\n#endif\n            }\n\n#if defined(DEBUG_FOCUS)\n            Cuda_Print_Mem_Usage( data );\n#endif\n        }\n\n        if ( retries >= MAX_RETRIES )\n        {\n            fprintf( stderr, \"[ERROR] Maximum retries reached for this step (%d). Terminating...\\n\",\n                  retries );\n            MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED );\n        }\n\n#elif defined(HAVE_HIP)\n        Hip_Initialize( system, control, data, workspace, lists, out_control, mpi_data );\n\n        /* compute f_0 */\n        Comm_Atoms( system, control, data, workspace, mpi_data, TRUE );\n\n#if defined(GPU_DEVICE_PACK)\n        //TODO: remove once Comm_Atoms ported\n        Hip_Copy_MPI_Data_Host_to_Device( control, mpi_data );\n#endif\n\n        Hip_Init_Block_Sizes( system, control );\n\n        Hip_Copy_Atoms_Host_to_Device( system, control );\n        Hip_Copy_Grid_Host_to_Device( control, &system->my_grid, &system->d_my_grid );\n\n        Hip_Reset( system, control, data, workspace, lists );\n\n        Hip_Generate_Neighbor_Lists( system, control, data, workspace, lists );\n\n        Hip_Compute_Forces( system, control, data, workspace, lists,\n                out_control, mpi_data );\n\n        Hip_Compute_Kinetic_Energy( system, control, workspace,\n                data, mpi_data->comm_mesh3D );\n\n        Compute_Total_Energy( system, control, data, MPI_COMM_WORLD );\n\n        Output_Results( system, control, data, lists, out_control, mpi_data );\n\n        Check_Energy( data );\n\n#if defined(DEBUG_FOCUS)\n        Hip_Print_Mem_Usage( data );\n#endif\n\n        ++data->step;\n        retries = 0;\n        while ( data->step <= control->nsteps && retries < MAX_RETRIES )\n        {\n            ret = SUCCESS;\n\n            if ( control->T_mode > 0 && retries == 0 )\n            {\n                Temperature_Control( control, data );\n            }\n    \n            ret = control->Hip_Evolve( system, control, data, workspace,\n                    lists, out_control, mpi_data );\n\n            if ( ret == SUCCESS )\n            {\n                Hip_Post_Evolve( system, control, data, workspace, lists,\n                        out_control, mpi_data );\n            }\n\n            if ( ret == SUCCESS )\n            {\n                data->timing.num_retries = retries;\n\n                Output_Results( system, control, data, lists, out_control, mpi_data );\n\n//              Analysis( system, control, data, workspace, lists, out_control, mpi_data );\n\n                if ( out_control->restart_freq\n                        && (data->step - data->prev_steps) % out_control->restart_freq == 0 )\n                {\n                    if ( out_control->restart_format == WRITE_ASCII )\n                    {\n                        Write_Restart_File( system, control, data, out_control, mpi_data );\n                    }\n                    else if ( out_control->restart_format == WRITE_BINARY )\n                    {\n                        Write_Binary_Restart_File( system, control, data, out_control, mpi_data );\n                    }\n                }\n\n                Check_Energy( data );\n\n                ++data->step;\n                retries = 0;\n            }\n            else\n            {\n                ++retries;\n\n#if defined(DEBUG_FOCUS)\n                fprintf( stderr, \"[INFO] p%d: retrying step %d...\\n\", system->my_rank, data->step );\n#endif\n            }\n\n#if defined(DEBUG_FOCUS)\n            Hip_Print_Mem_Usage( data );\n#endif\n        }\n\n        if ( retries >= MAX_RETRIES )\n        {\n            fprintf( stderr, \"[ERROR] Maximum retries reached for this step (%d). Terminating...\\n\",\n                  retries );\n            MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED );\n        }\n\n#else \n        Initialize( system, control, data, workspace, lists, out_control, mpi_data );\n       \n        /* compute f_0 */\n        Comm_Atoms( system, control, data, workspace, mpi_data, TRUE );\n\n        Reset( system, control, data, workspace, lists );\n\n        ret = Generate_Neighbor_Lists( system, control, data, workspace, lists );\n\n        if ( ret != SUCCESS )\n        {\n            fprintf( stderr, \"[ERROR] cannot generate initial neighbor lists. Terminating...\\n\" );\n            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );\n        }\n\n        ret = Compute_Forces( system, control, data, workspace, lists, out_control, mpi_data );\n\n        if ( ret != SUCCESS )\n        {\n            fprintf( stderr, \"[ERROR] cannot compute initial forces. Terminating...\\n\" );\n            MPI_Abort( MPI_COMM_WORLD, CANNOT_INITIALIZE );\n        }\n\n        Compute_Kinetic_Energy( system, data, mpi_data->comm_mesh3D );\n\n        Compute_Total_Energy( system, control, data, MPI_COMM_WORLD );\n\n        Output_Results( system, control, data, lists, out_control, mpi_data );\n\n        Check_Energy( data );\n\n        retries = 0;\n        ++data->step;\n        while ( data->step <= control->nsteps && retries < MAX_RETRIES )\n        {\n            ret = SUCCESS;\n\n            if ( control->T_mode > 0 && retries == 0 )\n            {\n                Temperature_Control( control, data );\n            }\n\n            ret = control->Evolve( system, control, data, workspace,\n                    lists, out_control, mpi_data );\n\n            if ( ret == SUCCESS )\n            {\n                Post_Evolve( system, control, data, workspace,\n                        lists, out_control, mpi_data );\n\n                data->timing.num_retries = retries;\n                Output_Results( system, control, data, lists, out_control, mpi_data );\n\n//              Analysis( system, control, data, workspace, lists, out_control, mpi_data );\n\n                if ( out_control->restart_freq\n                        && (data->step - data->prev_steps) % out_control->restart_freq == 0 )\n                {\n                    if ( out_control->restart_format == WRITE_ASCII )\n                    {\n                        Write_Restart_File( system, control, data, out_control, mpi_data );\n                    }\n                    else if ( out_control->restart_format == WRITE_BINARY )\n                    {\n                        Write_Binary_Restart_File( system, control, data, out_control, mpi_data );\n                    }\n                }\n\n                Check_Energy( data );\n\n                ++data->step;\n                retries = 0;\n            }\n            else\n            {\n                ++retries;\n\n#if defined(DEBUG_FOCUS)\n                fprintf( stderr, \"[INFO] p%d: retrying step %d...\\n\", system->my_rank, data->step );\n#endif\n            }\n        }\n\n        if ( retries >= MAX_RETRIES )\n        {\n            fprintf( stderr, \"[ERROR] Maximum retries reached for this step (%d). Terminating...\\n\",\n                  retries );\n            MPI_Abort( MPI_COMM_WORLD, MAX_RETRIES_REACHED );\n        }\n#endif\n\n//      Write_PDB_File( system, lists[BONDS], data, control, mpi_data, out_control );\n\n        /* end of simulation, write total simulation time\n         * (excluding deallocation routine time) after\n         * globally synchronizing clocks across all MPI processes */\n        MPI_Barrier( MPI_COMM_WORLD );\n        if ( system->my_rank == MASTER_NODE )\n        {\n            fprintf( out_control->out, \"Total Simulation Time: %.2f secs\\n\",\n                    Get_Time( ) - data->timing.start );\n        }\n\n        ret_pmd = PUREMD_SUCCESS;\n    }\n\n    return ret_pmd;\n}\n\n\nint cleanup( const void * const handle )\n{\n    int ret_pmd;\n    puremd_handle *pmd_handle;\n\n    ret_pmd = PUREMD_FAILURE;\n\n    if ( handle != NULL )\n    {\n        pmd_handle = (puremd_handle*) handle;\n\n#if defined(HAVE_CUDA)\n        //TODO: add Cuda_Finalize( ... )\n\n        Cuda_Cleanup_Environment( pmd_handle->control );\n#elif defined(HAVE_HIP)\n        //TODO: add Hip_Finalize( ... )\n\n        Hip_Cleanup_Environment( pmd_handle->control );\n#else\n        Finalize( pmd_handle->system, pmd_handle->control, pmd_handle->data,\n                pmd_handle->workspace, pmd_handle->lists, pmd_handle->out_control,\n                pmd_handle->mpi_data, pmd_handle->output_enabled );\n#endif\n\n#if defined(HAVE_CUDA) || defined(HAVE_HIP)\n        sfree( pmd_handle->workspace->d_workspace, __FILE__, __LINE__ );\n#endif\n        sfree( pmd_handle->mpi_data, __FILE__, __LINE__ );\n        sfree( pmd_handle->out_control, __FILE__, __LINE__ );\n        sfree( pmd_handle->lists, __FILE__, __LINE__ );\n        sfree( pmd_handle->workspace, __FILE__, __LINE__ );\n        sfree( pmd_handle->data, __FILE__, __LINE__ );\n        sfree( pmd_handle->control, __FILE__, __LINE__ );\n        sfree( pmd_handle->system, __FILE__, __LINE__ );\n\n        sfree( pmd_handle, __FILE__, __LINE__ );\n\n        ret_pmd = PUREMD_SUCCESS;\n    }\n\n    return ret_pmd;\n}\n\n\nreax_atom* get_atoms( const void * const handle )\n{\n    puremd_handle *pmd_handle;\n    reax_atom *atoms;\n\n    atoms = NULL;\n\n    if ( handle != NULL )\n    {\n        pmd_handle = (puremd_handle*) handle;\n        atoms = pmd_handle->system->my_atoms;\n    }\n\n    return atoms;\n}\n\n\nint set_output_enabled( const void * const handle, const int enabled )\n{\n    int ret;\n    puremd_handle *pmd_handle;\n\n    ret = PUREMD_FAILURE;\n\n    if ( handle != NULL )\n    {\n        pmd_handle = (puremd_handle*) handle;\n        pmd_handle->output_enabled = enabled;\n        ret = PUREMD_SUCCESS;\n    }\n\n    return ret;\n}", "label": 1}
{"code": "/*\n\n                 Copyright (c) 2010.\n      Lawrence Livermore National Security, LLC.\nProduced at the Lawrence Livermore National Laboratory.\n                  LLNL-CODE-461231\n                All rights reserved.\n\nThis file is part of LULESH, Version 1.0.\nPlease also read this link -- http://www.opensource.org/licenses/index.php\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions\nare met:\n\n   * Redistributions of source code must retain the above copyright\n     notice, this list of conditions and the disclaimer below.\n\n   * Redistributions in binary form must reproduce the above copyright\n     notice, this list of conditions and the disclaimer (as noted below)\n     in the documentation and/or other materials provided with the\n     distribution.\n\n   * Neither the name of the LLNS/LLNL nor the names of its contributors\n     may be used to endorse or promote products derived from this software\n     without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\nARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC,\nTHE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\nINDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\nBUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\nOF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\nNEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,\nEVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n\nAdditional BSD Notice\n\n1. This notice is required to be provided under our contract with the U.S.\n   Department of Energy (DOE). This work was produced at Lawrence Livermore\n   National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE.\n\n2. Neither the United States Government nor Lawrence Livermore National\n   Security, LLC nor any of their employees, makes any warranty, express\n   or implied, or assumes any liability or responsibility for the accuracy,\n   completeness, or usefulness of any information, apparatus, product, or\n   process disclosed, or represents that its use would not infringe\n   privately-owned rights.\n\n3. Also, reference herein to any specific commercial products, process, or\n   services by trade name, trademark, manufacturer or otherwise does not\n   necessarily constitute or imply its endorsement, recommendation, or\n   favoring by the United States Government or Lawrence Livermore National\n   Security, LLC. The views and opinions of authors expressed herein do not\n   necessarily state or reflect those of the United States Government or\n   Lawrence Livermore National Security, LLC, and shall not be used for\n   advertising or product endorsement purposes.\n\n*/\n\n#include <vector>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <sys/time.h>\n\n//#define LULESH_SHOW_PROGRESS 1\n\nenum { VolumeError = -1, QStopError = -2 } ;\n\n/****************************************************/\n/* Allow flexibility for arithmetic representations */\n/****************************************************/\n\n/* Could also support fixed point and interval arithmetic types */\ntypedef float        real4 ;\ntypedef double       real8 ;\ntypedef long double  real10 ;  /* 10 bytes on x86 */\n\ntypedef int    Index_t ; /* array subscript and loop index */\ntypedef real8  Real_t ;  /* floating point representation */\ntypedef int    Int_t ;   /* integer representation */\n\ninline real4  SQRT(real4  arg) { return sqrtf(arg) ; }\ninline real8  SQRT(real8  arg) { return sqrt(arg) ; }\ninline real10 SQRT(real10 arg) { return sqrtl(arg) ; }\n\ninline real4  CBRT(real4  arg) { return cbrtf(arg) ; }\ninline real8  CBRT(real8  arg) { return cbrt(arg) ; }\ninline real10 CBRT(real10 arg) { return cbrtl(arg) ; }\n\ninline real4  FABS(real4  arg) { return fabsf(arg) ; }\ninline real8  FABS(real8  arg) { return fabs(arg) ; }\ninline real10 FABS(real10 arg) { return fabsl(arg) ; }\n\n\n/************************************************************/\n/* Allow for flexible data layout experiments by separating */\n/* array interface from underlying implementation.          */\n/************************************************************/\n\nstruct Mesh {\n\n/* This first implementation allows for runnable code */\n/* and is not meant to be optimal. Final implementation */\n/* should separate declaration and allocation phases */\n/* so that allocation can be scheduled in a cache conscious */\n/* manner. */\n\npublic:\n\n   /**************/\n   /* Allocation */\n   /**************/\n\n   __attribute__((noinline)) void AllocateNodalPersistent(size_t size)\n   {\n      m_x.resize(size) ;\n      m_y.resize(size) ;\n      m_z.resize(size) ;\n\n      m_xd.resize(size, Real_t(0.)) ;\n      m_yd.resize(size, Real_t(0.)) ;\n      m_zd.resize(size, Real_t(0.)) ;\n\n      m_xdd.resize(size, Real_t(0.)) ;\n      m_ydd.resize(size, Real_t(0.)) ;\n      m_zdd.resize(size, Real_t(0.)) ;\n\n      m_fx.resize(size) ;\n      m_fy.resize(size) ;\n      m_fz.resize(size) ;\n\n      m_nodalMass.resize(size, Real_t(0.)) ;\n   }\n\n    __attribute__((noinline)) void AllocateElemPersistent(size_t size)\n   {\n      m_matElemlist.resize(size) ;\n      m_nodelist.resize(8*size) ;\n\n      m_lxim.resize(size) ;\n      m_lxip.resize(size) ;\n      m_letam.resize(size) ;\n      m_letap.resize(size) ;\n      m_lzetam.resize(size) ;\n      m_lzetap.resize(size) ;\n\n      m_elemBC.resize(size) ;\n\n      m_e.resize(size, Real_t(0.)) ;\n\n      m_p.resize(size, Real_t(0.)) ;\n      m_q.resize(size) ;\n      m_ql.resize(size) ;\n      m_qq.resize(size) ;\n\n      m_v.resize(size, 1.0) ;\n      m_volo.resize(size) ;\n      m_delv.resize(size) ;\n      m_vdov.resize(size) ;\n\n      m_arealg.resize(size) ;\n   \n      m_ss.resize(size) ;\n\n      m_elemMass.resize(size) ;\n   }\n\n   /* Temporaries should not be initialized in bulk but */\n   /* this is a runnable placeholder for now */\n    __attribute__((noinline)) void AllocateElemTemporary(size_t size)\n   {\n      m_dxx.resize(size) ;\n      m_dyy.resize(size) ;\n      m_dzz.resize(size) ;\n\n      m_delv_xi.resize(size) ;\n      m_delv_eta.resize(size) ;\n      m_delv_zeta.resize(size) ;\n\n      m_delx_xi.resize(size) ;\n      m_delx_eta.resize(size) ;\n      m_delx_zeta.resize(size) ;\n\n      m_vnew.resize(size) ;\n   }\n\n    __attribute__((noinline)) void AllocateNodesets(size_t size)\n   {\n      m_symmX.resize(size) ;\n      m_symmY.resize(size) ;\n      m_symmZ.resize(size) ;\n   }\n   \n   /**********/\n   /* Access */\n   /**********/\n\n   /* Node-centered */\n\n   Real_t& x(Index_t idx)    { return m_x[idx] ; }\n   Real_t& y(Index_t idx)    { return m_y[idx] ; }\n   Real_t& z(Index_t idx)    { return m_z[idx] ; }\n\n   Real_t& xd(Index_t idx)   { return m_xd[idx] ; }\n   Real_t& yd(Index_t idx)   { return m_yd[idx] ; }\n   Real_t& zd(Index_t idx)   { return m_zd[idx] ; }\n\n   Real_t& xdd(Index_t idx)  { return m_xdd[idx] ; }\n   Real_t& ydd(Index_t idx)  { return m_ydd[idx] ; }\n   Real_t& zdd(Index_t idx)  { return m_zdd[idx] ; }\n\n   Real_t& fx(Index_t idx)   { return m_fx[idx] ; }\n   Real_t& fy(Index_t idx)   { return m_fy[idx] ; }\n   Real_t& fz(Index_t idx)   { return m_fz[idx] ; }\n\n   Real_t& nodalMass(Index_t idx) { return m_nodalMass[idx] ; }\n\n   Index_t&  symmX(Index_t idx) { return m_symmX[idx] ; }\n   Index_t&  symmY(Index_t idx) { return m_symmY[idx] ; }\n   Index_t&  symmZ(Index_t idx) { return m_symmZ[idx] ; }\n\n   /* Element-centered */\n\n   Index_t&  matElemlist(Index_t idx) { return m_matElemlist[idx] ; }\n   Index_t*  nodelist(Index_t idx)    { return &m_nodelist[Index_t(8)*idx] ; }\n\n   Index_t&  lxim(Index_t idx) { return m_lxim[idx] ; }\n   Index_t&  lxip(Index_t idx) { return m_lxip[idx] ; }\n   Index_t&  letam(Index_t idx) { return m_letam[idx] ; }\n   Index_t&  letap(Index_t idx) { return m_letap[idx] ; }\n   Index_t&  lzetam(Index_t idx) { return m_lzetam[idx] ; }\n   Index_t&  lzetap(Index_t idx) { return m_lzetap[idx] ; }\n\n   Int_t&  elemBC(Index_t idx) { return m_elemBC[idx] ; }\n\n   Real_t& dxx(Index_t idx)  { return m_dxx[idx] ; }\n   Real_t& dyy(Index_t idx)  { return m_dyy[idx] ; }\n   Real_t& dzz(Index_t idx)  { return m_dzz[idx] ; }\n\n   Real_t& delv_xi(Index_t idx)    { return m_delv_xi[idx] ; }\n   Real_t& delv_eta(Index_t idx)   { return m_delv_eta[idx] ; }\n   Real_t& delv_zeta(Index_t idx)  { return m_delv_zeta[idx] ; }\n\n   Real_t& delx_xi(Index_t idx)    { return m_delx_xi[idx] ; }\n   Real_t& delx_eta(Index_t idx)   { return m_delx_eta[idx] ; }\n   Real_t& delx_zeta(Index_t idx)  { return m_delx_zeta[idx] ; }\n\n   Real_t& e(Index_t idx)          { return m_e[idx] ; }\n\n   Real_t& p(Index_t idx)          { return m_p[idx] ; }\n   Real_t& q(Index_t idx)          { return m_q[idx] ; }\n   Real_t& ql(Index_t idx)         { return m_ql[idx] ; }\n   Real_t& qq(Index_t idx)         { return m_qq[idx] ; }\n\n   Real_t& v(Index_t idx)          { return m_v[idx] ; }\n   Real_t& volo(Index_t idx)       { return m_volo[idx] ; }\n   Real_t& vnew(Index_t idx)       { return m_vnew[idx] ; }\n   Real_t& delv(Index_t idx)       { return m_delv[idx] ; }\n   Real_t& vdov(Index_t idx)       { return m_vdov[idx] ; }\n\n   Real_t& arealg(Index_t idx)     { return m_arealg[idx] ; }\n   \n   Real_t& ss(Index_t idx)         { return m_ss[idx] ; }\n\n   Real_t& elemMass(Index_t idx)  { return m_elemMass[idx] ; }\n\n   /* Params */\n\n   Real_t& dtfixed()              { return m_dtfixed ; }\n   Real_t& time()                 { return m_time ; }\n   Real_t& deltatime()            { return m_deltatime ; }\n   Real_t& deltatimemultlb()      { return m_deltatimemultlb ; }\n   Real_t& deltatimemultub()      { return m_deltatimemultub ; }\n   Real_t& stoptime()             { return m_stoptime ; }\n\n   Real_t& u_cut()                { return m_u_cut ; }\n   Real_t& hgcoef()               { return m_hgcoef ; }\n   Real_t& qstop()                { return m_qstop ; }\n   Real_t& monoq_max_slope()      { return m_monoq_max_slope ; }\n   Real_t& monoq_limiter_mult()   { return m_monoq_limiter_mult ; }\n   Real_t& e_cut()                { return m_e_cut ; }\n   Real_t& p_cut()                { return m_p_cut ; }\n   Real_t& ss4o3()                { return m_ss4o3 ; }\n   Real_t& q_cut()                { return m_q_cut ; }\n   Real_t& v_cut()                { return m_v_cut ; }\n   Real_t& qlc_monoq()            { return m_qlc_monoq ; }\n   Real_t& qqc_monoq()            { return m_qqc_monoq ; }\n   Real_t& qqc()                  { return m_qqc ; }\n   Real_t& eosvmax()              { return m_eosvmax ; }\n   Real_t& eosvmin()              { return m_eosvmin ; }\n   Real_t& pmin()                 { return m_pmin ; }\n   Real_t& emin()                 { return m_emin ; }\n   Real_t& dvovmax()              { return m_dvovmax ; }\n   Real_t& refdens()              { return m_refdens ; }\n\n   Real_t& dtcourant()            { return m_dtcourant ; }\n   Real_t& dthydro()              { return m_dthydro ; }\n   Real_t& dtmax()                { return m_dtmax ; }\n\n   Int_t&  cycle()                { return m_cycle ; }\n\n   Index_t&  sizeX()              { return m_sizeX ; }\n   Index_t&  sizeY()              { return m_sizeY ; }\n   Index_t&  sizeZ()              { return m_sizeZ ; }\n   Index_t&  numElem()            { return m_numElem ; }\n   Index_t&  numNode()            { return m_numNode ; }\n\nprivate:\n\n   /******************/\n   /* Implementation */\n   /******************/\n\n   /* Node-centered */\n\n   std::vector<Real_t> m_x ;  /* coordinates */\n   std::vector<Real_t> m_y ;\n   std::vector<Real_t> m_z ;\n\n   std::vector<Real_t> m_xd ; /* velocities */\n   std::vector<Real_t> m_yd ;\n   std::vector<Real_t> m_zd ;\n\n   std::vector<Real_t> m_xdd ; /* accelerations */\n   std::vector<Real_t> m_ydd ;\n   std::vector<Real_t> m_zdd ;\n\n   std::vector<Real_t> m_fx ;  /* forces */\n   std::vector<Real_t> m_fy ;\n   std::vector<Real_t> m_fz ;\n\n   std::vector<Real_t> m_nodalMass ;  /* mass */\n\n   std::vector<Index_t> m_symmX ;  /* symmetry plane nodesets */\n   std::vector<Index_t> m_symmY ;\n   std::vector<Index_t> m_symmZ ;\n\n   /* Element-centered */\n\n   std::vector<Index_t>  m_matElemlist ;  /* material indexset */\n   std::vector<Index_t>  m_nodelist ;     /* elemToNode connectivity */\n\n   std::vector<Index_t>  m_lxim ;  /* element connectivity across each face */\n   std::vector<Index_t>  m_lxip ;\n   std::vector<Index_t>  m_letam ;\n   std::vector<Index_t>  m_letap ;\n   std::vector<Index_t>  m_lzetam ;\n   std::vector<Index_t>  m_lzetap ;\n\n   std::vector<Int_t>    m_elemBC ;  /* symmetry/free-surface flags for each elem face */\n\n   std::vector<Real_t> m_dxx ;  /* principal strains -- temporary */\n   std::vector<Real_t> m_dyy ;\n   std::vector<Real_t> m_dzz ;\n\n   std::vector<Real_t> m_delv_xi ;    /* velocity gradient -- temporary */\n   std::vector<Real_t> m_delv_eta ;\n   std::vector<Real_t> m_delv_zeta ;\n\n   std::vector<Real_t> m_delx_xi ;    /* coordinate gradient -- temporary */\n   std::vector<Real_t> m_delx_eta ;\n   std::vector<Real_t> m_delx_zeta ;\n   \n   std::vector<Real_t> m_e ;   /* energy */\n\n   std::vector<Real_t> m_p ;   /* pressure */\n   std::vector<Real_t> m_q ;   /* q */\n   std::vector<Real_t> m_ql ;  /* linear term for q */\n   std::vector<Real_t> m_qq ;  /* quadratic term for q */\n\n   std::vector<Real_t> m_v ;     /* relative volume */\n   std::vector<Real_t> m_volo ;  /* reference volume */\n   std::vector<Real_t> m_vnew ;  /* new relative volume -- temporary */\n   std::vector<Real_t> m_delv ;  /* m_vnew - m_v */\n   std::vector<Real_t> m_vdov ;  /* volume derivative over volume */\n\n   std::vector<Real_t> m_arealg ;  /* characteristic length of an element */\n   \n   std::vector<Real_t> m_ss ;      /* \"sound speed\" */\n\n   std::vector<Real_t> m_elemMass ;  /* mass */\n\n   /* Parameters */\n\n   Real_t  m_dtfixed ;           /* fixed time increment */\n   Real_t  m_time ;              /* current time */\n   Real_t  m_deltatime ;         /* variable time increment */\n   Real_t  m_deltatimemultlb ;\n   Real_t  m_deltatimemultub ;\n   Real_t  m_stoptime ;          /* end time for simulation */\n\n   Real_t  m_u_cut ;             /* velocity tolerance */\n   Real_t  m_hgcoef ;            /* hourglass control */\n   Real_t  m_qstop ;             /* excessive q indicator */\n   Real_t  m_monoq_max_slope ;\n   Real_t  m_monoq_limiter_mult ;\n   Real_t  m_e_cut ;             /* energy tolerance */\n   Real_t  m_p_cut ;             /* pressure tolerance */\n   Real_t  m_ss4o3 ;\n   Real_t  m_q_cut ;             /* q tolerance */\n   Real_t  m_v_cut ;             /* relative volume tolerance */\n   Real_t  m_qlc_monoq ;         /* linear term coef for q */\n   Real_t  m_qqc_monoq ;         /* quadratic term coef for q */\n   Real_t  m_qqc ;\n   Real_t  m_eosvmax ;\n   Real_t  m_eosvmin ;\n   Real_t  m_pmin ;              /* pressure floor */\n   Real_t  m_emin ;              /* energy floor */\n   Real_t  m_dvovmax ;           /* maximum allowable volume change */\n   Real_t  m_refdens ;           /* reference density */\n\n   Real_t  m_dtcourant ;         /* courant constraint */\n   Real_t  m_dthydro ;           /* volume change constraint */\n   Real_t  m_dtmax ;             /* maximum allowable time increment */\n\n   Int_t   m_cycle ;             /* iteration count for simulation */\n\n   Index_t   m_sizeX ;           /* X,Y,Z extent of this block */\n   Index_t   m_sizeY ;\n   Index_t   m_sizeZ ;\n\n   Index_t   m_numElem ;         /* Elements/Nodes in this domain */\n   Index_t   m_numNode ;\n} mesh ;\n\n\ntemplate <typename T>\nT *Allocate(size_t size)\n{\n   return static_cast<T *>(malloc(sizeof(T)*size)) ;\n}\n\ntemplate <typename T>\nvoid Release(T **ptr)\n{\n   if (*ptr != NULL) {\n      free(*ptr) ;\n      *ptr = NULL ;\n   }\n}\n\n\n/* Stuff needed for boundary conditions */\n/* 2 BCs on each of 6 hexahedral faces (12 bits) */\n#define XI_M        0x003\n#define XI_M_SYMM   0x001\n#define XI_M_FREE   0x002\n\n#define XI_P        0x00c\n#define XI_P_SYMM   0x004\n#define XI_P_FREE   0x008\n\n#define ETA_M       0x030\n#define ETA_M_SYMM  0x010\n#define ETA_M_FREE  0x020\n\n#define ETA_P       0x0c0\n#define ETA_P_SYMM  0x040\n#define ETA_P_FREE  0x080\n\n#define ZETA_M      0x300\n#define ZETA_M_SYMM 0x100\n#define ZETA_M_FREE 0x200\n\n#define ZETA_P      0xc00\n#define ZETA_P_SYMM 0x400\n#define ZETA_P_FREE 0x800\n\n\n////static inline\n __attribute__((noinline)) void TimeIncrement()\n{\n   Real_t targetdt = mesh.stoptime() - mesh.time() ;\n\n   if ((mesh.dtfixed() <= Real_t(0.0)) && (mesh.cycle() != Int_t(0))) {\n      Real_t ratio ;\n      Real_t olddt = mesh.deltatime() ;\n\n      /* This will require a reduction in parallel */\n      Real_t newdt = Real_t(1.0e+20) ;\n      if (mesh.dtcourant() < newdt) {\n         newdt = mesh.dtcourant() / Real_t(2.0) ;\n      }\n      if (mesh.dthydro() < newdt) {\n         newdt = mesh.dthydro() * Real_t(2.0) / Real_t(3.0) ;\n      }\n\n      ratio = newdt / olddt ;\n      if (ratio >= Real_t(1.0)) {\n         if (ratio < mesh.deltatimemultlb()) {\n            newdt = olddt ;\n         }\n         else if (ratio > mesh.deltatimemultub()) {\n            newdt = olddt*mesh.deltatimemultub() ;\n         }\n      }\n\n      if (newdt > mesh.dtmax()) {\n         newdt = mesh.dtmax() ;\n      }\n      mesh.deltatime() = newdt ;\n   }\n\n   /* TRY TO PREVENT VERY SMALL SCALING ON THE NEXT CYCLE */\n   if ((targetdt > mesh.deltatime()) &&\n       (targetdt < (Real_t(4.0) * mesh.deltatime() / Real_t(3.0))) ) {\n      targetdt = Real_t(2.0) * mesh.deltatime() / Real_t(3.0) ;\n   }\n\n   if (targetdt < mesh.deltatime()) {\n      mesh.deltatime() = targetdt ;\n   }\n\n   mesh.time() += mesh.deltatime() ;\n\n   ++mesh.cycle() ;\n}\n\n////static inline\n __attribute__((noinline)) void InitStressTermsForElems(Index_t numElem, \n                             Real_t *sigxx, Real_t *sigyy, Real_t *sigzz)\n{\n   //\n   // pull in the stresses appropriate to the hydro integration\n   //\n   for (Index_t i = 0 ; i < numElem ; ++i){\n      sigxx[i] =  sigyy[i] = sigzz[i] =  - mesh.p(i) - mesh.q(i) ;\n   }\n}\n\n////static inline\n __attribute__((noinline)) void CalcElemShapeFunctionDerivatives( const Real_t* const x,\n                                       const Real_t* const y,\n                                       const Real_t* const z,\n                                       Real_t b[][8],\n                                       Real_t* const volume )\n{\n  const Real_t x0 = x[0] ;   const Real_t x1 = x[1] ;\n  const Real_t x2 = x[2] ;   const Real_t x3 = x[3] ;\n  const Real_t x4 = x[4] ;   const Real_t x5 = x[5] ;\n  const Real_t x6 = x[6] ;   const Real_t x7 = x[7] ;\n\n  const Real_t y0 = y[0] ;   const Real_t y1 = y[1] ;\n  const Real_t y2 = y[2] ;   const Real_t y3 = y[3] ;\n  const Real_t y4 = y[4] ;   const Real_t y5 = y[5] ;\n  const Real_t y6 = y[6] ;   const Real_t y7 = y[7] ;\n\n  const Real_t z0 = z[0] ;   const Real_t z1 = z[1] ;\n  const Real_t z2 = z[2] ;   const Real_t z3 = z[3] ;\n  const Real_t z4 = z[4] ;   const Real_t z5 = z[5] ;\n  const Real_t z6 = z[6] ;   const Real_t z7 = z[7] ;\n\n  Real_t fjxxi, fjxet, fjxze;\n  Real_t fjyxi, fjyet, fjyze;\n  Real_t fjzxi, fjzet, fjzze;\n  Real_t cjxxi, cjxet, cjxze;\n  Real_t cjyxi, cjyet, cjyze;\n  Real_t cjzxi, cjzet, cjzze;\n\n  fjxxi = .125 * ( (x6-x0) + (x5-x3) - (x7-x1) - (x4-x2) );\n  fjxet = .125 * ( (x6-x0) - (x5-x3) + (x7-x1) - (x4-x2) );\n  fjxze = .125 * ( (x6-x0) + (x5-x3) + (x7-x1) + (x4-x2) );\n\n  fjyxi = .125 * ( (y6-y0) + (y5-y3) - (y7-y1) - (y4-y2) );\n  fjyet = .125 * ( (y6-y0) - (y5-y3) + (y7-y1) - (y4-y2) );\n  fjyze = .125 * ( (y6-y0) + (y5-y3) + (y7-y1) + (y4-y2) );\n\n  fjzxi = .125 * ( (z6-z0) + (z5-z3) - (z7-z1) - (z4-z2) );\n  fjzet = .125 * ( (z6-z0) - (z5-z3) + (z7-z1) - (z4-z2) );\n  fjzze = .125 * ( (z6-z0) + (z5-z3) + (z7-z1) + (z4-z2) );\n\n  /* compute cofactors */\n  cjxxi =    (fjyet * fjzze) - (fjzet * fjyze);\n  cjxet =  - (fjyxi * fjzze) + (fjzxi * fjyze);\n  cjxze =    (fjyxi * fjzet) - (fjzxi * fjyet);\n\n  cjyxi =  - (fjxet * fjzze) + (fjzet * fjxze);\n  cjyet =    (fjxxi * fjzze) - (fjzxi * fjxze);\n  cjyze =  - (fjxxi * fjzet) + (fjzxi * fjxet);\n\n  cjzxi =    (fjxet * fjyze) - (fjyet * fjxze);\n  cjzet =  - (fjxxi * fjyze) + (fjyxi * fjxze);\n  cjzze =    (fjxxi * fjyet) - (fjyxi * fjxet);\n\n  /* calculate partials :\n     this need only be done for l = 0,1,2,3   since , by symmetry ,\n     (6,7,4,5) = - (0,1,2,3) .\n  */\n  b[0][0] =   -  cjxxi  -  cjxet  -  cjxze;\n  b[0][1] =      cjxxi  -  cjxet  -  cjxze;\n  b[0][2] =      cjxxi  +  cjxet  -  cjxze;\n  b[0][3] =   -  cjxxi  +  cjxet  -  cjxze;\n  b[0][4] = -b[0][2];\n  b[0][5] = -b[0][3];\n  b[0][6] = -b[0][0];\n  b[0][7] = -b[0][1];\n\n  b[1][0] =   -  cjyxi  -  cjyet  -  cjyze;\n  b[1][1] =      cjyxi  -  cjyet  -  cjyze;\n  b[1][2] =      cjyxi  +  cjyet  -  cjyze;\n  b[1][3] =   -  cjyxi  +  cjyet  -  cjyze;\n  b[1][4] = -b[1][2];\n  b[1][5] = -b[1][3];\n  b[1][6] = -b[1][0];\n  b[1][7] = -b[1][1];\n\n  b[2][0] =   -  cjzxi  -  cjzet  -  cjzze;\n  b[2][1] =      cjzxi  -  cjzet  -  cjzze;\n  b[2][2] =      cjzxi  +  cjzet  -  cjzze;\n  b[2][3] =   -  cjzxi  +  cjzet  -  cjzze;\n  b[2][4] = -b[2][2];\n  b[2][5] = -b[2][3];\n  b[2][6] = -b[2][0];\n  b[2][7] = -b[2][1];\n\n  /* calculate jacobian determinant (volume) */\n  *volume = Real_t(8.) * ( fjxet * cjxet + fjyet * cjyet + fjzet * cjzet);\n}\n\n////static inline\n __attribute__((noinline)) void SumElemFaceNormal(Real_t *normalX0, Real_t *normalY0, Real_t *normalZ0,\n                       Real_t *normalX1, Real_t *normalY1, Real_t *normalZ1,\n                       Real_t *normalX2, Real_t *normalY2, Real_t *normalZ2,\n                       Real_t *normalX3, Real_t *normalY3, Real_t *normalZ3,\n                       const Real_t x0, const Real_t y0, const Real_t z0,\n                       const Real_t x1, const Real_t y1, const Real_t z1,\n                       const Real_t x2, const Real_t y2, const Real_t z2,\n                       const Real_t x3, const Real_t y3, const Real_t z3)\n{\n   Real_t bisectX0 = Real_t(0.5) * (x3 + x2 - x1 - x0);\n   Real_t bisectY0 = Real_t(0.5) * (y3 + y2 - y1 - y0);\n   Real_t bisectZ0 = Real_t(0.5) * (z3 + z2 - z1 - z0);\n   Real_t bisectX1 = Real_t(0.5) * (x2 + x1 - x3 - x0);\n   Real_t bisectY1 = Real_t(0.5) * (y2 + y1 - y3 - y0);\n   Real_t bisectZ1 = Real_t(0.5) * (z2 + z1 - z3 - z0);\n   Real_t areaX = Real_t(0.25) * (bisectY0 * bisectZ1 - bisectZ0 * bisectY1);\n   Real_t areaY = Real_t(0.25) * (bisectZ0 * bisectX1 - bisectX0 * bisectZ1);\n   Real_t areaZ = Real_t(0.25) * (bisectX0 * bisectY1 - bisectY0 * bisectX1);\n\n   *normalX0 += areaX;\n   *normalX1 += areaX;\n   *normalX2 += areaX;\n   *normalX3 += areaX;\n\n   *normalY0 += areaY;\n   *normalY1 += areaY;\n   *normalY2 += areaY;\n   *normalY3 += areaY;\n\n   *normalZ0 += areaZ;\n   *normalZ1 += areaZ;\n   *normalZ2 += areaZ;\n   *normalZ3 += areaZ;\n}\n\n////static inline\n __attribute__((noinline)) void CalcElemNodeNormals(Real_t pfx[8],\n                         Real_t pfy[8],\n                         Real_t pfz[8],\n                         const Real_t x[8],\n                         const Real_t y[8],\n                         const Real_t z[8])\n{\n   for (Index_t i = 0 ; i < 8 ; ++i) {\n      pfx[i] = Real_t(0.0);\n      pfy[i] = Real_t(0.0);\n      pfz[i] = Real_t(0.0);\n   }\n   /* evaluate face one: nodes 0, 1, 2, 3 */\n   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],\n                  &pfx[1], &pfy[1], &pfz[1],\n                  &pfx[2], &pfy[2], &pfz[2],\n                  &pfx[3], &pfy[3], &pfz[3],\n                  x[0], y[0], z[0], x[1], y[1], z[1],\n                  x[2], y[2], z[2], x[3], y[3], z[3]);\n   /* evaluate face two: nodes 0, 4, 5, 1 */\n   SumElemFaceNormal(&pfx[0], &pfy[0], &pfz[0],\n                  &pfx[4], &pfy[4], &pfz[4],\n                  &pfx[5], &pfy[5], &pfz[5],\n                  &pfx[1], &pfy[1], &pfz[1],\n                  x[0], y[0], z[0], x[4], y[4], z[4],\n                  x[5], y[5], z[5], x[1], y[1], z[1]);\n   /* evaluate face three: nodes 1, 5, 6, 2 */\n   SumElemFaceNormal(&pfx[1], &pfy[1], &pfz[1],\n                  &pfx[5], &pfy[5], &pfz[5],\n                  &pfx[6], &pfy[6], &pfz[6],\n                  &pfx[2], &pfy[2], &pfz[2],\n                  x[1], y[1], z[1], x[5], y[5], z[5],\n                  x[6], y[6], z[6], x[2], y[2], z[2]);\n   /* evaluate face four: nodes 2, 6, 7, 3 */\n   SumElemFaceNormal(&pfx[2], &pfy[2], &pfz[2],\n                  &pfx[6], &pfy[6], &pfz[6],\n                  &pfx[7], &pfy[7], &pfz[7],\n                  &pfx[3], &pfy[3], &pfz[3],\n                  x[2], y[2], z[2], x[6], y[6], z[6],\n                  x[7], y[7], z[7], x[3], y[3], z[3]);\n   /* evaluate face five: nodes 3, 7, 4, 0 */\n   SumElemFaceNormal(&pfx[3], &pfy[3], &pfz[3],\n                  &pfx[7], &pfy[7], &pfz[7],\n                  &pfx[4], &pfy[4], &pfz[4],\n                  &pfx[0], &pfy[0], &pfz[0],\n                  x[3], y[3], z[3], x[7], y[7], z[7],\n                  x[4], y[4], z[4], x[0], y[0], z[0]);\n   /* evaluate face six: nodes 4, 7, 6, 5 */\n   SumElemFaceNormal(&pfx[4], &pfy[4], &pfz[4],\n                  &pfx[7], &pfy[7], &pfz[7],\n                  &pfx[6], &pfy[6], &pfz[6],\n                  &pfx[5], &pfy[5], &pfz[5],\n                  x[4], y[4], z[4], x[7], y[7], z[7],\n                  x[6], y[6], z[6], x[5], y[5], z[5]);\n}\n\n////static inline\n __attribute__((noinline)) void SumElemStressesToNodeForces( const Real_t B[][8],\n                                  const Real_t stress_xx,\n                                  const Real_t stress_yy,\n                                  const Real_t stress_zz,\n                                  Real_t* const fx,\n                                  Real_t* const fy,\n                                  Real_t* const fz )\n{\n  Real_t pfx0 = B[0][0] ;   Real_t pfx1 = B[0][1] ;\n  Real_t pfx2 = B[0][2] ;   Real_t pfx3 = B[0][3] ;\n  Real_t pfx4 = B[0][4] ;   Real_t pfx5 = B[0][5] ;\n  Real_t pfx6 = B[0][6] ;   Real_t pfx7 = B[0][7] ;\n\n  Real_t pfy0 = B[1][0] ;   Real_t pfy1 = B[1][1] ;\n  Real_t pfy2 = B[1][2] ;   Real_t pfy3 = B[1][3] ;\n  Real_t pfy4 = B[1][4] ;   Real_t pfy5 = B[1][5] ;\n  Real_t pfy6 = B[1][6] ;   Real_t pfy7 = B[1][7] ;\n\n  Real_t pfz0 = B[2][0] ;   Real_t pfz1 = B[2][1] ;\n  Real_t pfz2 = B[2][2] ;   Real_t pfz3 = B[2][3] ;\n  Real_t pfz4 = B[2][4] ;   Real_t pfz5 = B[2][5] ;\n  Real_t pfz6 = B[2][6] ;   Real_t pfz7 = B[2][7] ;\n\n  fx[0] = -( stress_xx * pfx0 );\n  fx[1] = -( stress_xx * pfx1 );\n  fx[2] = -( stress_xx * pfx2 );\n  fx[3] = -( stress_xx * pfx3 );\n  fx[4] = -( stress_xx * pfx4 );\n  fx[5] = -( stress_xx * pfx5 );\n  fx[6] = -( stress_xx * pfx6 );\n  fx[7] = -( stress_xx * pfx7 );\n\n  fy[0] = -( stress_yy * pfy0  );\n  fy[1] = -( stress_yy * pfy1  );\n  fy[2] = -( stress_yy * pfy2  );\n  fy[3] = -( stress_yy * pfy3  );\n  fy[4] = -( stress_yy * pfy4  );\n  fy[5] = -( stress_yy * pfy5  );\n  fy[6] = -( stress_yy * pfy6  );\n  fy[7] = -( stress_yy * pfy7  );\n\n  fz[0] = -( stress_zz * pfz0 );\n  fz[1] = -( stress_zz * pfz1 );\n  fz[2] = -( stress_zz * pfz2 );\n  fz[3] = -( stress_zz * pfz3 );\n  fz[4] = -( stress_zz * pfz4 );\n  fz[5] = -( stress_zz * pfz5 );\n  fz[6] = -( stress_zz * pfz6 );\n  fz[7] = -( stress_zz * pfz7 );\n}\n\n////static inline\n __attribute__((noinline)) void IntegrateStressForElems( Index_t numElem,\n                              Real_t *sigxx, Real_t *sigyy, Real_t *sigzz,\n                              Real_t *determ)\n{\n  Real_t B[3][8] ;// shape function derivatives\n  Real_t x_local[8] ;\n  Real_t y_local[8] ;\n  Real_t z_local[8] ;\n  Real_t fx_local[8] ;\n  Real_t fy_local[8] ;\n  Real_t fz_local[8] ;\n\n  // loop over all elements\n  for( Index_t k=0 ; k<numElem ; ++k )\n  //for( Index_t k=0 ; k<10 ; ++k )\n  {\n    const Index_t* const elemNodes = mesh.nodelist(k);\n\n    // get nodal coordinates from global arrays and copy into local arrays.\n    for( Index_t lnode=0 ; lnode<8 ; ++lnode )\n    {\n      Index_t gnode = elemNodes[lnode];\n      x_local[lnode] = mesh.x(gnode);\n      y_local[lnode] = mesh.y(gnode);\n      z_local[lnode] = mesh.z(gnode);\n    }\n\n    /* Volume calculation involves extra work for numerical consistency. */\n    CalcElemShapeFunctionDerivatives(x_local, y_local, z_local,\n                                         B, &determ[k]);\n\n    CalcElemNodeNormals( B[0] , B[1], B[2],\n                          x_local, y_local, z_local );\n\n    SumElemStressesToNodeForces( B, sigxx[k], sigyy[k], sigzz[k],\n                                         fx_local, fy_local, fz_local ) ;\n\n    // copy nodal force contributions to global force arrray.\n    for( Index_t lnode=0 ; lnode<8 ; ++lnode )\n    {\n      Index_t gnode = elemNodes[lnode];\n      mesh.fx(gnode) += fx_local[lnode];\n      mesh.fy(gnode) += fy_local[lnode];\n      mesh.fz(gnode) += fz_local[lnode];\n    }\n  }\n}\n\n////static inline\n __attribute__((noinline)) void CollectDomainNodesToElemNodes(const Index_t* elemToNode,\n                                   Real_t elemX[8],\n                                   Real_t elemY[8],\n                                   Real_t elemZ[8])\n{\n   Index_t nd0i = elemToNode[0] ;\n   Index_t nd1i = elemToNode[1] ;\n   Index_t nd2i = elemToNode[2] ;\n   Index_t nd3i = elemToNode[3] ;\n   Index_t nd4i = elemToNode[4] ;\n   Index_t nd5i = elemToNode[5] ;\n   Index_t nd6i = elemToNode[6] ;\n   Index_t nd7i = elemToNode[7] ;\n\n   elemX[0] = mesh.x(nd0i);\n   elemX[1] = mesh.x(nd1i);\n   elemX[2] = mesh.x(nd2i);\n   elemX[3] = mesh.x(nd3i);\n   elemX[4] = mesh.x(nd4i);\n   elemX[5] = mesh.x(nd5i);\n   elemX[6] = mesh.x(nd6i);\n   elemX[7] = mesh.x(nd7i);\n\n   elemY[0] = mesh.y(nd0i);\n   elemY[1] = mesh.y(nd1i);\n   elemY[2] = mesh.y(nd2i);\n   elemY[3] = mesh.y(nd3i);\n   elemY[4] = mesh.y(nd4i);\n   elemY[5] = mesh.y(nd5i);\n   elemY[6] = mesh.y(nd6i);\n   elemY[7] = mesh.y(nd7i);\n\n   elemZ[0] = mesh.z(nd0i);\n   elemZ[1] = mesh.z(nd1i);\n   elemZ[2] = mesh.z(nd2i);\n   elemZ[3] = mesh.z(nd3i);\n   elemZ[4] = mesh.z(nd4i);\n   elemZ[5] = mesh.z(nd5i);\n   elemZ[6] = mesh.z(nd6i);\n   elemZ[7] = mesh.z(nd7i);\n\n}\n\n////static inline\n __attribute__((noinline)) void VoluDer(const Real_t x0, const Real_t x1, const Real_t x2,\n             const Real_t x3, const Real_t x4, const Real_t x5,\n             const Real_t y0, const Real_t y1, const Real_t y2,\n             const Real_t y3, const Real_t y4, const Real_t y5,\n             const Real_t z0, const Real_t z1, const Real_t z2,\n             const Real_t z3, const Real_t z4, const Real_t z5,\n             Real_t* dvdx, Real_t* dvdy, Real_t* dvdz)\n{\n   const Real_t twelfth = Real_t(1.0) / Real_t(12.0) ;\n\n   *dvdx =\n      (y1 + y2) * (z0 + z1) - (y0 + y1) * (z1 + z2) +\n      (y0 + y4) * (z3 + z4) - (y3 + y4) * (z0 + z4) -\n      (y2 + y5) * (z3 + z5) + (y3 + y5) * (z2 + z5);\n   *dvdy =\n      - (x1 + x2) * (z0 + z1) + (x0 + x1) * (z1 + z2) -\n      (x0 + x4) * (z3 + z4) + (x3 + x4) * (z0 + z4) +\n      (x2 + x5) * (z3 + z5) - (x3 + x5) * (z2 + z5);\n\n   *dvdz =\n      - (y1 + y2) * (x0 + x1) + (y0 + y1) * (x1 + x2) -\n      (y0 + y4) * (x3 + x4) + (y3 + y4) * (x0 + x4) +\n      (y2 + y5) * (x3 + x5) - (y3 + y5) * (x2 + x5);\n\n   *dvdx *= twelfth;\n   *dvdy *= twelfth;\n   *dvdz *= twelfth;\n}\n\n////static inline\n __attribute__((noinline)) void CalcElemVolumeDerivative(Real_t dvdx[8],\n                              Real_t dvdy[8],\n                              Real_t dvdz[8],\n                              const Real_t x[8],\n                              const Real_t y[8],\n                              const Real_t z[8])\n{\n   VoluDer(x[1], x[2], x[3], x[4], x[5], x[7],\n           y[1], y[2], y[3], y[4], y[5], y[7],\n           z[1], z[2], z[3], z[4], z[5], z[7],\n           &dvdx[0], &dvdy[0], &dvdz[0]);\n   VoluDer(x[0], x[1], x[2], x[7], x[4], x[6],\n           y[0], y[1], y[2], y[7], y[4], y[6],\n           z[0], z[1], z[2], z[7], z[4], z[6],\n           &dvdx[3], &dvdy[3], &dvdz[3]);\n   VoluDer(x[3], x[0], x[1], x[6], x[7], x[5],\n           y[3], y[0], y[1], y[6], y[7], y[5],\n           z[3], z[0], z[1], z[6], z[7], z[5],\n           &dvdx[2], &dvdy[2], &dvdz[2]);\n   VoluDer(x[2], x[3], x[0], x[5], x[6], x[4],\n           y[2], y[3], y[0], y[5], y[6], y[4],\n           z[2], z[3], z[0], z[5], z[6], z[4],\n           &dvdx[1], &dvdy[1], &dvdz[1]);\n   VoluDer(x[7], x[6], x[5], x[0], x[3], x[1],\n           y[7], y[6], y[5], y[0], y[3], y[1],\n           z[7], z[6], z[5], z[0], z[3], z[1],\n           &dvdx[4], &dvdy[4], &dvdz[4]);\n   VoluDer(x[4], x[7], x[6], x[1], x[0], x[2],\n           y[4], y[7], y[6], y[1], y[0], y[2],\n           z[4], z[7], z[6], z[1], z[0], z[2],\n           &dvdx[5], &dvdy[5], &dvdz[5]);\n   VoluDer(x[5], x[4], x[7], x[2], x[1], x[3],\n           y[5], y[4], y[7], y[2], y[1], y[3],\n           z[5], z[4], z[7], z[2], z[1], z[3],\n           &dvdx[6], &dvdy[6], &dvdz[6]);\n   VoluDer(x[6], x[5], x[4], x[3], x[2], x[0],\n           y[6], y[5], y[4], y[3], y[2], y[0],\n           z[6], z[5], z[4], z[3], z[2], z[0],\n           &dvdx[7], &dvdy[7], &dvdz[7]);\n}\n\n//static inline\n __attribute__((noinline)) void CalcElemFBHourglassForce(Real_t *xd, Real_t *yd, Real_t *zd,  Real_t *hourgam0,\n                              Real_t *hourgam1, Real_t *hourgam2, Real_t *hourgam3,\n                              Real_t *hourgam4, Real_t *hourgam5, Real_t *hourgam6,\n                              Real_t *hourgam7, Real_t coefficient,\n                              Real_t *hgfx, Real_t *hgfy, Real_t *hgfz )\n{\n   Index_t i00=0;\n   Index_t i01=1;\n   Index_t i02=2;\n   Index_t i03=3;\n\n   Real_t h00 =\n      hourgam0[i00] * xd[0] + hourgam1[i00] * xd[1] +\n      hourgam2[i00] * xd[2] + hourgam3[i00] * xd[3] +\n      hourgam4[i00] * xd[4] + hourgam5[i00] * xd[5] +\n      hourgam6[i00] * xd[6] + hourgam7[i00] * xd[7];\n\n   Real_t h01 =\n      hourgam0[i01] * xd[0] + hourgam1[i01] * xd[1] +\n      hourgam2[i01] * xd[2] + hourgam3[i01] * xd[3] +\n      hourgam4[i01] * xd[4] + hourgam5[i01] * xd[5] +\n      hourgam6[i01] * xd[6] + hourgam7[i01] * xd[7];\n\n   Real_t h02 =\n      hourgam0[i02] * xd[0] + hourgam1[i02] * xd[1]+\n      hourgam2[i02] * xd[2] + hourgam3[i02] * xd[3]+\n      hourgam4[i02] * xd[4] + hourgam5[i02] * xd[5]+\n      hourgam6[i02] * xd[6] + hourgam7[i02] * xd[7];\n\n   Real_t h03 =\n      hourgam0[i03] * xd[0] + hourgam1[i03] * xd[1] +\n      hourgam2[i03] * xd[2] + hourgam3[i03] * xd[3] +\n      hourgam4[i03] * xd[4] + hourgam5[i03] * xd[5] +\n      hourgam6[i03] * xd[6] + hourgam7[i03] * xd[7];\n\n   hgfx[0] = coefficient *\n      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +\n       hourgam0[i02] * h02 + hourgam0[i03] * h03);\n\n   hgfx[1] = coefficient *\n      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +\n       hourgam1[i02] * h02 + hourgam1[i03] * h03);\n\n   hgfx[2] = coefficient *\n      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +\n       hourgam2[i02] * h02 + hourgam2[i03] * h03);\n\n   hgfx[3] = coefficient *\n      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +\n       hourgam3[i02] * h02 + hourgam3[i03] * h03);\n\n   hgfx[4] = coefficient *\n      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +\n       hourgam4[i02] * h02 + hourgam4[i03] * h03);\n\n   hgfx[5] = coefficient *\n      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +\n       hourgam5[i02] * h02 + hourgam5[i03] * h03);\n\n   hgfx[6] = coefficient *\n      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +\n       hourgam6[i02] * h02 + hourgam6[i03] * h03);\n\n   hgfx[7] = coefficient *\n      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +\n       hourgam7[i02] * h02 + hourgam7[i03] * h03);\n\n   h00 =\n      hourgam0[i00] * yd[0] + hourgam1[i00] * yd[1] +\n      hourgam2[i00] * yd[2] + hourgam3[i00] * yd[3] +\n      hourgam4[i00] * yd[4] + hourgam5[i00] * yd[5] +\n      hourgam6[i00] * yd[6] + hourgam7[i00] * yd[7];\n\n   h01 =\n      hourgam0[i01] * yd[0] + hourgam1[i01] * yd[1] +\n      hourgam2[i01] * yd[2] + hourgam3[i01] * yd[3] +\n      hourgam4[i01] * yd[4] + hourgam5[i01] * yd[5] +\n      hourgam6[i01] * yd[6] + hourgam7[i01] * yd[7];\n\n   h02 =\n      hourgam0[i02] * yd[0] + hourgam1[i02] * yd[1]+\n      hourgam2[i02] * yd[2] + hourgam3[i02] * yd[3]+\n      hourgam4[i02] * yd[4] + hourgam5[i02] * yd[5]+\n      hourgam6[i02] * yd[6] + hourgam7[i02] * yd[7];\n\n   h03 =\n      hourgam0[i03] * yd[0] + hourgam1[i03] * yd[1] +\n      hourgam2[i03] * yd[2] + hourgam3[i03] * yd[3] +\n      hourgam4[i03] * yd[4] + hourgam5[i03] * yd[5] +\n      hourgam6[i03] * yd[6] + hourgam7[i03] * yd[7];\n\n\n   hgfy[0] = coefficient *\n      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +\n       hourgam0[i02] * h02 + hourgam0[i03] * h03);\n\n   hgfy[1] = coefficient *\n      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +\n       hourgam1[i02] * h02 + hourgam1[i03] * h03);\n\n   hgfy[2] = coefficient *\n      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +\n       hourgam2[i02] * h02 + hourgam2[i03] * h03);\n\n   hgfy[3] = coefficient *\n      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +\n       hourgam3[i02] * h02 + hourgam3[i03] * h03);\n\n   hgfy[4] = coefficient *\n      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +\n       hourgam4[i02] * h02 + hourgam4[i03] * h03);\n\n   hgfy[5] = coefficient *\n      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +\n       hourgam5[i02] * h02 + hourgam5[i03] * h03);\n\n   hgfy[6] = coefficient *\n      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +\n       hourgam6[i02] * h02 + hourgam6[i03] * h03);\n\n   hgfy[7] = coefficient *\n      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +\n       hourgam7[i02] * h02 + hourgam7[i03] * h03);\n\n   h00 =\n      hourgam0[i00] * zd[0] + hourgam1[i00] * zd[1] +\n      hourgam2[i00] * zd[2] + hourgam3[i00] * zd[3] +\n      hourgam4[i00] * zd[4] + hourgam5[i00] * zd[5] +\n      hourgam6[i00] * zd[6] + hourgam7[i00] * zd[7];\n\n   h01 =\n      hourgam0[i01] * zd[0] + hourgam1[i01] * zd[1] +\n      hourgam2[i01] * zd[2] + hourgam3[i01] * zd[3] +\n      hourgam4[i01] * zd[4] + hourgam5[i01] * zd[5] +\n      hourgam6[i01] * zd[6] + hourgam7[i01] * zd[7];\n\n   h02 =\n      hourgam0[i02] * zd[0] + hourgam1[i02] * zd[1]+\n      hourgam2[i02] * zd[2] + hourgam3[i02] * zd[3]+\n      hourgam4[i02] * zd[4] + hourgam5[i02] * zd[5]+\n      hourgam6[i02] * zd[6] + hourgam7[i02] * zd[7];\n\n   h03 =\n      hourgam0[i03] * zd[0] + hourgam1[i03] * zd[1] +\n      hourgam2[i03] * zd[2] + hourgam3[i03] * zd[3] +\n      hourgam4[i03] * zd[4] + hourgam5[i03] * zd[5] +\n      hourgam6[i03] * zd[6] + hourgam7[i03] * zd[7];\n\n\n   hgfz[0] = coefficient *\n      (hourgam0[i00] * h00 + hourgam0[i01] * h01 +\n       hourgam0[i02] * h02 + hourgam0[i03] * h03);\n\n   hgfz[1] = coefficient *\n      (hourgam1[i00] * h00 + hourgam1[i01] * h01 +\n       hourgam1[i02] * h02 + hourgam1[i03] * h03);\n\n   hgfz[2] = coefficient *\n      (hourgam2[i00] * h00 + hourgam2[i01] * h01 +\n       hourgam2[i02] * h02 + hourgam2[i03] * h03);\n\n   hgfz[3] = coefficient *\n      (hourgam3[i00] * h00 + hourgam3[i01] * h01 +\n       hourgam3[i02] * h02 + hourgam3[i03] * h03);\n\n   hgfz[4] = coefficient *\n      (hourgam4[i00] * h00 + hourgam4[i01] * h01 +\n       hourgam4[i02] * h02 + hourgam4[i03] * h03);\n\n   hgfz[5] = coefficient *\n      (hourgam5[i00] * h00 + hourgam5[i01] * h01 +\n       hourgam5[i02] * h02 + hourgam5[i03] * h03);\n\n   hgfz[6] = coefficient *\n      (hourgam6[i00] * h00 + hourgam6[i01] * h01 +\n       hourgam6[i02] * h02 + hourgam6[i03] * h03);\n\n   hgfz[7] = coefficient *\n      (hourgam7[i00] * h00 + hourgam7[i01] * h01 +\n       hourgam7[i02] * h02 + hourgam7[i03] * h03);\n}\n\n//static inline\n __attribute__((noinline)) void CalcFBHourglassForceForElems(Real_t *determ,\n            Real_t *x8n,      Real_t *y8n,      Real_t *z8n,\n            Real_t *dvdx,     Real_t *dvdy,     Real_t *dvdz,\n            Real_t hourg)\n{\n   /*************************************************\n    *\n    *     FUNCTION: Calculates the Flanagan-Belytschko anti-hourglass\n    *               force.\n    *\n    *************************************************/\n\n   Index_t numElem = mesh.numElem() ;\n\n   Real_t hgfx[8], hgfy[8], hgfz[8] ;\n\n   Real_t coefficient;\n\n   Real_t  gamma[4][8];\n   Real_t hourgam0[4], hourgam1[4], hourgam2[4], hourgam3[4] ;\n   Real_t hourgam4[4], hourgam5[4], hourgam6[4], hourgam7[4];\n   Real_t xd1[8], yd1[8], zd1[8] ;\n\n   gamma[0][0] = Real_t( 1.);\n   gamma[0][1] = Real_t( 1.);\n   gamma[0][2] = Real_t(-1.);\n   gamma[0][3] = Real_t(-1.);\n   gamma[0][4] = Real_t(-1.);\n   gamma[0][5] = Real_t(-1.);\n   gamma[0][6] = Real_t( 1.);\n   gamma[0][7] = Real_t( 1.);\n   gamma[1][0] = Real_t( 1.);\n   gamma[1][1] = Real_t(-1.);\n   gamma[1][2] = Real_t(-1.);\n   gamma[1][3] = Real_t( 1.);\n   gamma[1][4] = Real_t(-1.);\n   gamma[1][5] = Real_t( 1.);\n   gamma[1][6] = Real_t( 1.);\n   gamma[1][7] = Real_t(-1.);\n   gamma[2][0] = Real_t( 1.);\n   gamma[2][1] = Real_t(-1.);\n   gamma[2][2] = Real_t( 1.);\n   gamma[2][3] = Real_t(-1.);\n   gamma[2][4] = Real_t( 1.);\n   gamma[2][5] = Real_t(-1.);\n   gamma[2][6] = Real_t( 1.);\n   gamma[2][7] = Real_t(-1.);\n   gamma[3][0] = Real_t(-1.);\n   gamma[3][1] = Real_t( 1.);\n   gamma[3][2] = Real_t(-1.);\n   gamma[3][3] = Real_t( 1.);\n   gamma[3][4] = Real_t( 1.);\n   gamma[3][5] = Real_t(-1.);\n   gamma[3][6] = Real_t( 1.);\n   gamma[3][7] = Real_t(-1.);\n\n/*************************************************/\n/*    compute the hourglass modes */\n\n\n   for(Index_t i2=0;i2<numElem;++i2){\n      const Index_t *elemToNode = mesh.nodelist(i2);\n      Index_t i3=8*i2;\n      Real_t volinv=Real_t(1.0)/determ[i2];\n      Real_t ss1, mass1, volume13 ;\n      for(Index_t i1=0;i1<4;++i1){\n\n         Real_t hourmodx =\n            x8n[i3] * gamma[i1][0] + x8n[i3+1] * gamma[i1][1] +\n            x8n[i3+2] * gamma[i1][2] + x8n[i3+3] * gamma[i1][3] +\n            x8n[i3+4] * gamma[i1][4] + x8n[i3+5] * gamma[i1][5] +\n            x8n[i3+6] * gamma[i1][6] + x8n[i3+7] * gamma[i1][7];\n\n         Real_t hourmody =\n            y8n[i3] * gamma[i1][0] + y8n[i3+1] * gamma[i1][1] +\n            y8n[i3+2] * gamma[i1][2] + y8n[i3+3] * gamma[i1][3] +\n            y8n[i3+4] * gamma[i1][4] + y8n[i3+5] * gamma[i1][5] +\n            y8n[i3+6] * gamma[i1][6] + y8n[i3+7] * gamma[i1][7];\n\n         Real_t hourmodz =\n            z8n[i3] * gamma[i1][0] + z8n[i3+1] * gamma[i1][1] +\n            z8n[i3+2] * gamma[i1][2] + z8n[i3+3] * gamma[i1][3] +\n            z8n[i3+4] * gamma[i1][4] + z8n[i3+5] * gamma[i1][5] +\n            z8n[i3+6] * gamma[i1][6] + z8n[i3+7] * gamma[i1][7];\n\n         hourgam0[i1] = gamma[i1][0] -  volinv*(dvdx[i3  ] * hourmodx +\n                                                  dvdy[i3  ] * hourmody +\n                                                  dvdz[i3  ] * hourmodz );\n\n         hourgam1[i1] = gamma[i1][1] -  volinv*(dvdx[i3+1] * hourmodx +\n                                                  dvdy[i3+1] * hourmody +\n                                                  dvdz[i3+1] * hourmodz );\n\n         hourgam2[i1] = gamma[i1][2] -  volinv*(dvdx[i3+2] * hourmodx +\n                                                  dvdy[i3+2] * hourmody +\n                                                  dvdz[i3+2] * hourmodz );\n\n         hourgam3[i1] = gamma[i1][3] -  volinv*(dvdx[i3+3] * hourmodx +\n                                                  dvdy[i3+3] * hourmody +\n                                                  dvdz[i3+3] * hourmodz );\n\n         hourgam4[i1] = gamma[i1][4] -  volinv*(dvdx[i3+4] * hourmodx +\n                                                  dvdy[i3+4] * hourmody +\n                                                  dvdz[i3+4] * hourmodz );\n\n         hourgam5[i1] = gamma[i1][5] -  volinv*(dvdx[i3+5] * hourmodx +\n                                                  dvdy[i3+5] * hourmody +\n                                                  dvdz[i3+5] * hourmodz );\n\n         hourgam6[i1] = gamma[i1][6] -  volinv*(dvdx[i3+6] * hourmodx +\n                                                  dvdy[i3+6] * hourmody +\n                                                  dvdz[i3+6] * hourmodz );\n\n         hourgam7[i1] = gamma[i1][7] -  volinv*(dvdx[i3+7] * hourmodx +\n                                                  dvdy[i3+7] * hourmody +\n                                                  dvdz[i3+7] * hourmodz );\n\n      }\n\n      /* compute forces */\n      /* store forces into h arrays (force arrays) */\n\n      ss1=mesh.ss(i2);\n      mass1=mesh.elemMass(i2);\n      volume13=CBRT(determ[i2]);\n\n      Index_t n0si2 = elemToNode[0];\n      Index_t n1si2 = elemToNode[1];\n      Index_t n2si2 = elemToNode[2];\n      Index_t n3si2 = elemToNode[3];\n      Index_t n4si2 = elemToNode[4];\n      Index_t n5si2 = elemToNode[5];\n      Index_t n6si2 = elemToNode[6];\n      Index_t n7si2 = elemToNode[7];\n\n      xd1[0] = mesh.xd(n0si2);\n      xd1[1] = mesh.xd(n1si2);\n      xd1[2] = mesh.xd(n2si2);\n      xd1[3] = mesh.xd(n3si2);\n      xd1[4] = mesh.xd(n4si2);\n      xd1[5] = mesh.xd(n5si2);\n      xd1[6] = mesh.xd(n6si2);\n      xd1[7] = mesh.xd(n7si2);\n\n      yd1[0] = mesh.yd(n0si2);\n      yd1[1] = mesh.yd(n1si2);\n      yd1[2] = mesh.yd(n2si2);\n      yd1[3] = mesh.yd(n3si2);\n      yd1[4] = mesh.yd(n4si2);\n      yd1[5] = mesh.yd(n5si2);\n      yd1[6] = mesh.yd(n6si2);\n      yd1[7] = mesh.yd(n7si2);\n\n      zd1[0] = mesh.zd(n0si2);\n      zd1[1] = mesh.zd(n1si2);\n      zd1[2] = mesh.zd(n2si2);\n      zd1[3] = mesh.zd(n3si2);\n      zd1[4] = mesh.zd(n4si2);\n      zd1[5] = mesh.zd(n5si2);\n      zd1[6] = mesh.zd(n6si2);\n      zd1[7] = mesh.zd(n7si2);\n\n      coefficient = - hourg * Real_t(0.01) * ss1 * mass1 / volume13;\n\n      CalcElemFBHourglassForce(xd1,yd1,zd1,\n                      hourgam0,hourgam1,hourgam2,hourgam3,\n                      hourgam4,hourgam5,hourgam6,hourgam7,\n                      coefficient, hgfx, hgfy, hgfz);\n\n      mesh.fx(n0si2) += hgfx[0];\n      mesh.fy(n0si2) += hgfy[0];\n      mesh.fz(n0si2) += hgfz[0];\n\n      mesh.fx(n1si2) += hgfx[1];\n      mesh.fy(n1si2) += hgfy[1];\n      mesh.fz(n1si2) += hgfz[1];\n\n      mesh.fx(n2si2) += hgfx[2];\n      mesh.fy(n2si2) += hgfy[2];\n      mesh.fz(n2si2) += hgfz[2];\n\n      mesh.fx(n3si2) += hgfx[3];\n      mesh.fy(n3si2) += hgfy[3];\n      mesh.fz(n3si2) += hgfz[3];\n\n      mesh.fx(n4si2) += hgfx[4];\n      mesh.fy(n4si2) += hgfy[4];\n      mesh.fz(n4si2) += hgfz[4];\n\n      mesh.fx(n5si2) += hgfx[5];\n      mesh.fy(n5si2) += hgfy[5];\n      mesh.fz(n5si2) += hgfz[5];\n\n      mesh.fx(n6si2) += hgfx[6];\n      mesh.fy(n6si2) += hgfy[6];\n      mesh.fz(n6si2) += hgfz[6];\n\n      mesh.fx(n7si2) += hgfx[7];\n      mesh.fy(n7si2) += hgfy[7];\n      mesh.fz(n7si2) += hgfz[7];\n   }\n}\n\n//static inline\n __attribute__((noinline)) void CalcHourglassControlForElems(Real_t determ[], Real_t hgcoef)\n{\n   Index_t i, ii, jj ;\n   Real_t  x1[8],  y1[8],  z1[8] ;\n   Real_t pfx[8], pfy[8], pfz[8] ;\n   Index_t numElem = mesh.numElem() ;\n   Index_t numElem8 = numElem * 8 ;\n   Real_t *dvdx = Allocate<Real_t>(numElem8) ;\n   Real_t *dvdy = Allocate<Real_t>(numElem8) ;\n   Real_t *dvdz = Allocate<Real_t>(numElem8) ;\n   Real_t *x8n  = Allocate<Real_t>(numElem8) ;\n   Real_t *y8n  = Allocate<Real_t>(numElem8) ;\n   Real_t *z8n  = Allocate<Real_t>(numElem8) ;\n\n   /* start loop over elements */\n   for (i=0 ; i<numElem ; ++i){\n\n      Index_t* elemToNode = mesh.nodelist(i);\n      CollectDomainNodesToElemNodes(elemToNode, x1, y1, z1);\n\n      CalcElemVolumeDerivative(pfx, pfy, pfz, x1, y1, z1);\n\n      /* load into temporary storage for FB Hour Glass control */\n      for(ii=0;ii<8;++ii){\n         jj=8*i+ii;\n\n         dvdx[jj] = pfx[ii];\n         dvdy[jj] = pfy[ii];\n         dvdz[jj] = pfz[ii];\n         x8n[jj]  = x1[ii];\n         y8n[jj]  = y1[ii];\n         z8n[jj]  = z1[ii];\n      }\n\n      determ[i] = mesh.volo(i) * mesh.v(i);\n\n      /* Do a check for negative volumes */\n      if ( mesh.v(i) <= Real_t(0.0) ) {\n         exit(VolumeError) ;\n      }\n   }\n\n   if ( hgcoef > Real_t(0.) ) {\n      CalcFBHourglassForceForElems(determ,x8n,y8n,z8n,dvdx,dvdy,dvdz,hgcoef) ;\n   }\n\n   Release(&z8n) ;\n   Release(&y8n) ;\n   Release(&x8n) ;\n   Release(&dvdz) ;\n   Release(&dvdy) ;\n   Release(&dvdx) ;\n\n   return ;\n}\n\n//static inline\n __attribute__((noinline))  __attribute__((noinline))void CalcVolumeForceForElems()\n{\n   Index_t numElem = mesh.numElem() ;\n   if (numElem != 0) {\n      Real_t  hgcoef = mesh.hgcoef() ;\n      Real_t *sigxx  = Allocate<Real_t>(numElem) ;\n      Real_t *sigyy  = Allocate<Real_t>(numElem) ;\n      Real_t *sigzz  = Allocate<Real_t>(numElem) ;\n      Real_t *determ = Allocate<Real_t>(numElem) ;\n\n      /* Sum contributions to total stress tensor */\n      InitStressTermsForElems(numElem, sigxx, sigyy, sigzz);\n\n      // call elemlib stress integration loop to produce nodal forces from\n      // material stresses.\n      IntegrateStressForElems( numElem, sigxx, sigyy, sigzz, determ) ;\n\n      // check for negative element volume\n      for ( Index_t k=0 ; k<numElem ; ++k ) {\n         if (determ[k] <= Real_t(0.0)) {\n            exit(VolumeError) ;\n         }\n      }\n\n      CalcHourglassControlForElems(determ, hgcoef) ;\n\n      Release(&determ) ;\n      Release(&sigzz) ;\n      Release(&sigyy) ;\n      Release(&sigxx) ;\n   }\n}\n\n//static inline\n __attribute__((noinline)) void CalcForceForNodes()\n{\n  Index_t numNode = mesh.numNode() ;\n  for (Index_t i=0; i<numNode; ++i) {\n     mesh.fx(i) = Real_t(0.0) ;\n     mesh.fy(i) = Real_t(0.0) ;\n     mesh.fz(i) = Real_t(0.0) ;\n  }\n\n  /* Calcforce calls partial, force, hourq */\n  CalcVolumeForceForElems() ;\n\n  /* Calculate Nodal Forces at domain boundaries */\n  /* problem->commSBN->Transfer(CommSBN::forces); */\n\n}\n\n//static inline\n __attribute__((noinline)) void CalcAccelerationForNodes()\n{\n   Index_t numNode = mesh.numNode() ;\n   for (Index_t i = 0; i < numNode; ++i) {\n      mesh.xdd(i) = mesh.fx(i) / mesh.nodalMass(i);\n      mesh.ydd(i) = mesh.fy(i) / mesh.nodalMass(i);\n      mesh.zdd(i) = mesh.fz(i) / mesh.nodalMass(i);\n   }\n}\n\n//static inline\n __attribute__((noinline)) void ApplyAccelerationBoundaryConditionsForNodes()\n{\n  Index_t numNodeBC = (mesh.sizeX()+1)*(mesh.sizeX()+1) ;\n  for(Index_t i=0 ; i<numNodeBC ; ++i)\n     mesh.xdd(mesh.symmX(i)) = Real_t(0.0) ;\n\n  for(Index_t i=0 ; i<numNodeBC ; ++i)\n     mesh.ydd(mesh.symmY(i)) = Real_t(0.0) ;\n\n  for(Index_t i=0 ; i<numNodeBC ; ++i)\n     mesh.zdd(mesh.symmZ(i)) = Real_t(0.0) ;\n}\n\n//static inline\n __attribute__((noinline)) void CalcVelocityForNodes(const Real_t dt, const Real_t u_cut)\n{\n   Index_t numNode = mesh.numNode() ;\n\n   for ( Index_t i = 0 ; i < numNode ; ++i )\n   {\n     Real_t xdtmp, ydtmp, zdtmp ;\n\n     xdtmp = mesh.xd(i) + mesh.xdd(i) * dt ;\n     if( FABS(xdtmp) < u_cut ) xdtmp = Real_t(0.0);\n     mesh.xd(i) = xdtmp ;\n\n     ydtmp = mesh.yd(i) + mesh.ydd(i) * dt ;\n     if( FABS(ydtmp) < u_cut ) ydtmp = Real_t(0.0);\n     mesh.yd(i) = ydtmp ;\n\n     zdtmp = mesh.zd(i) + mesh.zdd(i) * dt ;\n     if( FABS(zdtmp) < u_cut ) zdtmp = Real_t(0.0);\n     mesh.zd(i) = zdtmp ;\n   }\n}\n\n//static inline\n __attribute__((noinline)) void CalcPositionForNodes(const Real_t dt)\n{\n   Index_t numNode = mesh.numNode() ;\n\n   for ( Index_t i = 0 ; i < numNode ; ++i )\n   {\n     mesh.x(i) += mesh.xd(i) * dt ;\n     mesh.y(i) += mesh.yd(i) * dt ;\n     mesh.z(i) += mesh.zd(i) * dt ;\n   }\n}\n\n//static inline\n __attribute__((noinline)) void LagrangeNodal()\n{\n  const Real_t delt = mesh.deltatime() ;\n  Real_t u_cut = mesh.u_cut() ;\n\n  /* time of boundary condition evaluation is beginning of step for force and\n   * acceleration boundary conditions. */\n  CalcForceForNodes();\n\n  CalcAccelerationForNodes();\n\n  ApplyAccelerationBoundaryConditionsForNodes();\n\n  CalcVelocityForNodes( delt, u_cut ) ;\n\n  CalcPositionForNodes( delt );\n\n  return;\n}\n\n//static inline\n __attribute__((noinline)) Real_t CalcElemVolume( const Real_t x0, const Real_t x1,\n               const Real_t x2, const Real_t x3,\n               const Real_t x4, const Real_t x5,\n               const Real_t x6, const Real_t x7,\n               const Real_t y0, const Real_t y1,\n               const Real_t y2, const Real_t y3,\n               const Real_t y4, const Real_t y5,\n               const Real_t y6, const Real_t y7,\n               const Real_t z0, const Real_t z1,\n               const Real_t z2, const Real_t z3,\n               const Real_t z4, const Real_t z5,\n               const Real_t z6, const Real_t z7 )\n{\n  Real_t twelveth = Real_t(1.0)/Real_t(12.0);\n\n  Real_t dx61 = x6 - x1;\n  Real_t dy61 = y6 - y1;\n  Real_t dz61 = z6 - z1;\n\n  Real_t dx70 = x7 - x0;\n  Real_t dy70 = y7 - y0;\n  Real_t dz70 = z7 - z0;\n\n  Real_t dx63 = x6 - x3;\n  Real_t dy63 = y6 - y3;\n  Real_t dz63 = z6 - z3;\n\n  Real_t dx20 = x2 - x0;\n  Real_t dy20 = y2 - y0;\n  Real_t dz20 = z2 - z0;\n\n  Real_t dx50 = x5 - x0;\n  Real_t dy50 = y5 - y0;\n  Real_t dz50 = z5 - z0;\n\n  Real_t dx64 = x6 - x4;\n  Real_t dy64 = y6 - y4;\n  Real_t dz64 = z6 - z4;\n\n  Real_t dx31 = x3 - x1;\n  Real_t dy31 = y3 - y1;\n  Real_t dz31 = z3 - z1;\n\n  Real_t dx72 = x7 - x2;\n  Real_t dy72 = y7 - y2;\n  Real_t dz72 = z7 - z2;\n\n  Real_t dx43 = x4 - x3;\n  Real_t dy43 = y4 - y3;\n  Real_t dz43 = z4 - z3;\n\n  Real_t dx57 = x5 - x7;\n  Real_t dy57 = y5 - y7;\n  Real_t dz57 = z5 - z7;\n\n  Real_t dx14 = x1 - x4;\n  Real_t dy14 = y1 - y4;\n  Real_t dz14 = z1 - z4;\n\n  Real_t dx25 = x2 - x5;\n  Real_t dy25 = y2 - y5;\n  Real_t dz25 = z2 - z5;\n\n#define TRIPLE_PRODUCT(x1, y1, z1, x2, y2, z2, x3, y3, z3) \\\n   ((x1)*((y2)*(z3) - (z2)*(y3)) + (x2)*((z1)*(y3) - (y1)*(z3)) + (x3)*((y1)*(z2) - (z1)*(y2)))\n\n  Real_t volume =\n    TRIPLE_PRODUCT(dx31 + dx72, dx63, dx20,\n       dy31 + dy72, dy63, dy20,\n       dz31 + dz72, dz63, dz20) +\n    TRIPLE_PRODUCT(dx43 + dx57, dx64, dx70,\n       dy43 + dy57, dy64, dy70,\n       dz43 + dz57, dz64, dz70) +\n    TRIPLE_PRODUCT(dx14 + dx25, dx61, dx50,\n       dy14 + dy25, dy61, dy50,\n       dz14 + dz25, dz61, dz50);\n\n#undef TRIPLE_PRODUCT\n\n  volume *= twelveth;\n\n  return volume ;\n}\n\n//static inline\n __attribute__((noinline)) Real_t CalcElemVolume( const Real_t x[8], const Real_t y[8], const Real_t z[8] )\n{\nreturn CalcElemVolume( x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],\n                       y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7],\n                       z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7]);\n}\n\n//static inline\n __attribute__((noinline)) Real_t AreaFace( const Real_t x0, const Real_t x1,\n                 const Real_t x2, const Real_t x3,\n                 const Real_t y0, const Real_t y1,\n                 const Real_t y2, const Real_t y3,\n                 const Real_t z0, const Real_t z1,\n                 const Real_t z2, const Real_t z3)\n{\n   Real_t fx = (x2 - x0) - (x3 - x1);\n   Real_t fy = (y2 - y0) - (y3 - y1);\n   Real_t fz = (z2 - z0) - (z3 - z1);\n   Real_t gx = (x2 - x0) + (x3 - x1);\n   Real_t gy = (y2 - y0) + (y3 - y1);\n   Real_t gz = (z2 - z0) + (z3 - z1);\n   Real_t area =\n      (fx * fx + fy * fy + fz * fz) *\n      (gx * gx + gy * gy + gz * gz) -\n      (fx * gx + fy * gy + fz * gz) *\n      (fx * gx + fy * gy + fz * gz);\n   return area ;\n}\n\n//static inline\n __attribute__((noinline)) Real_t CalcElemCharacteristicLength( const Real_t x[8],\n                                     const Real_t y[8],\n                                     const Real_t z[8],\n                                     const Real_t volume)\n{\n   Real_t a, charLength = Real_t(0.0);\n\n   a = AreaFace(x[0],x[1],x[2],x[3],\n                y[0],y[1],y[2],y[3],\n                z[0],z[1],z[2],z[3]) ;\n   charLength = std::max(a,charLength) ;\n\n   a = AreaFace(x[4],x[5],x[6],x[7],\n                y[4],y[5],y[6],y[7],\n                z[4],z[5],z[6],z[7]) ;\n   charLength = std::max(a,charLength) ;\n\n   a = AreaFace(x[0],x[1],x[5],x[4],\n                y[0],y[1],y[5],y[4],\n                z[0],z[1],z[5],z[4]) ;\n   charLength = std::max(a,charLength) ;\n\n   a = AreaFace(x[1],x[2],x[6],x[5],\n                y[1],y[2],y[6],y[5],\n                z[1],z[2],z[6],z[5]) ;\n   charLength = std::max(a,charLength) ;\n\n   a = AreaFace(x[2],x[3],x[7],x[6],\n                y[2],y[3],y[7],y[6],\n                z[2],z[3],z[7],z[6]) ;\n   charLength = std::max(a,charLength) ;\n\n   a = AreaFace(x[3],x[0],x[4],x[7],\n                y[3],y[0],y[4],y[7],\n                z[3],z[0],z[4],z[7]) ;\n   charLength = std::max(a,charLength) ;\n\n   charLength = Real_t(4.0) * volume / SQRT(charLength);\n\n   return charLength;\n}\n\n//static inline\n __attribute__((noinline)) void CalcElemVelocityGrandient( const Real_t* const xvel,\n                                const Real_t* const yvel,\n                                const Real_t* const zvel,\n                                const Real_t b[][8],\n                                const Real_t detJ,\n                                Real_t* const d )\n{\n  const Real_t inv_detJ = Real_t(1.0) / detJ ;\n  Real_t dyddx, dxddy, dzddx, dxddz, dzddy, dyddz;\n  const Real_t* const pfx = b[0];\n  const Real_t* const pfy = b[1];\n  const Real_t* const pfz = b[2];\n\n  d[0] = inv_detJ * ( pfx[0] * (xvel[0]-xvel[6])\n                     + pfx[1] * (xvel[1]-xvel[7])\n                     + pfx[2] * (xvel[2]-xvel[4])\n                     + pfx[3] * (xvel[3]-xvel[5]) );\n\n  d[1] = inv_detJ * ( pfy[0] * (yvel[0]-yvel[6])\n                     + pfy[1] * (yvel[1]-yvel[7])\n                     + pfy[2] * (yvel[2]-yvel[4])\n                     + pfy[3] * (yvel[3]-yvel[5]) );\n\n  d[2] = inv_detJ * ( pfz[0] * (zvel[0]-zvel[6])\n                     + pfz[1] * (zvel[1]-zvel[7])\n                     + pfz[2] * (zvel[2]-zvel[4])\n                     + pfz[3] * (zvel[3]-zvel[5]) );\n\n  dyddx  = inv_detJ * ( pfx[0] * (yvel[0]-yvel[6])\n                      + pfx[1] * (yvel[1]-yvel[7])\n                      + pfx[2] * (yvel[2]-yvel[4])\n                      + pfx[3] * (yvel[3]-yvel[5]) );\n\n  dxddy  = inv_detJ * ( pfy[0] * (xvel[0]-xvel[6])\n                      + pfy[1] * (xvel[1]-xvel[7])\n                      + pfy[2] * (xvel[2]-xvel[4])\n                      + pfy[3] * (xvel[3]-xvel[5]) );\n\n  dzddx  = inv_detJ * ( pfx[0] * (zvel[0]-zvel[6])\n                      + pfx[1] * (zvel[1]-zvel[7])\n                      + pfx[2] * (zvel[2]-zvel[4])\n                      + pfx[3] * (zvel[3]-zvel[5]) );\n\n  dxddz  = inv_detJ * ( pfz[0] * (xvel[0]-xvel[6])\n                      + pfz[1] * (xvel[1]-xvel[7])\n                      + pfz[2] * (xvel[2]-xvel[4])\n                      + pfz[3] * (xvel[3]-xvel[5]) );\n\n  dzddy  = inv_detJ * ( pfy[0] * (zvel[0]-zvel[6])\n                      + pfy[1] * (zvel[1]-zvel[7])\n                      + pfy[2] * (zvel[2]-zvel[4])\n                      + pfy[3] * (zvel[3]-zvel[5]) );\n\n  dyddz  = inv_detJ * ( pfz[0] * (yvel[0]-yvel[6])\n                      + pfz[1] * (yvel[1]-yvel[7])\n                      + pfz[2] * (yvel[2]-yvel[4])\n                      + pfz[3] * (yvel[3]-yvel[5]) );\n  d[5]  = Real_t( .5) * ( dxddy + dyddx );\n  d[4]  = Real_t( .5) * ( dxddz + dzddx );\n  d[3]  = Real_t( .5) * ( dzddy + dyddz );\n}\n\n//static inline\n __attribute__((noinline)) void CalcKinematicsForElems( Index_t numElem, Real_t dt )\n{\n  Real_t B[3][8] ; /** shape function derivatives */\n  Real_t D[6] ;\n  Real_t x_local[8] ;\n  Real_t y_local[8] ;\n  Real_t z_local[8] ;\n  Real_t xd_local[8] ;\n  Real_t yd_local[8] ;\n  Real_t zd_local[8] ;\n  Real_t detJ = Real_t(0.0) ;\n\n  // loop over all elements\n  for( Index_t k=0 ; k<numElem ; ++k )\n  {\n    Real_t volume ;\n    Real_t relativeVolume ;\n    const Index_t* const elemToNode = mesh.nodelist(k) ;\n\n    // get nodal coordinates from global arrays and copy into local arrays.\n    for( Index_t lnode=0 ; lnode<8 ; ++lnode )\n    {\n      Index_t gnode = elemToNode[lnode];\n      x_local[lnode] = mesh.x(gnode);\n      y_local[lnode] = mesh.y(gnode);\n      z_local[lnode] = mesh.z(gnode);\n    }\n\n    // volume calculations\n    volume = CalcElemVolume(x_local, y_local, z_local );\n    relativeVolume = volume / mesh.volo(k) ;\n    mesh.vnew(k) = relativeVolume ;\n    mesh.delv(k) = relativeVolume - mesh.v(k) ;\n\n    // set characteristic length\n    mesh.arealg(k) = CalcElemCharacteristicLength(x_local,\n                                                  y_local,\n                                                  z_local,\n                                                  volume);\n\n    // get nodal velocities from global array and copy into local arrays.\n    for( Index_t lnode=0 ; lnode<8 ; ++lnode )\n    {\n      Index_t gnode = elemToNode[lnode];\n      xd_local[lnode] = mesh.xd(gnode);\n      yd_local[lnode] = mesh.yd(gnode);\n      zd_local[lnode] = mesh.zd(gnode);\n    }\n\n    Real_t dt2 = Real_t(0.5) * dt;\n    for ( Index_t j=0 ; j<8 ; ++j )\n    {\n       x_local[j] -= dt2 * xd_local[j];\n       y_local[j] -= dt2 * yd_local[j];\n       z_local[j] -= dt2 * zd_local[j];\n    }\n\n    CalcElemShapeFunctionDerivatives( x_local,\n                                          y_local,\n                                          z_local,\n                                          B, &detJ );\n\n    CalcElemVelocityGrandient( xd_local,\n                               yd_local,\n                               zd_local,\n                               B, detJ, D );\n\n    // put velocity gradient quantities into their global arrays.\n    mesh.dxx(k) = D[0];\n    mesh.dyy(k) = D[1];\n    mesh.dzz(k) = D[2];\n  }\n}\n\n//static inline\n __attribute__((noinline)) void CalcLagrangeElements(Real_t deltatime)\n{\n   Index_t numElem = mesh.numElem() ;\n   if (numElem > 0) {\n      // set element connectivity array as a single dimension array. It is\n      // assumed that the array will be of length numelems*numnodesperelem.\n\n      CalcKinematicsForElems(numElem, deltatime) ;\n\n      // element loop to do some stuff not included in the elemlib function.\n      for ( Index_t k=0 ; k<numElem ; ++k )\n      {\n        // calc strain rate and apply as constraint (only done in FB element)\n        Real_t vdov = mesh.dxx(k) + mesh.dyy(k) + mesh.dzz(k) ;\n        Real_t vdovthird = vdov/Real_t(3.0) ;\n        \n        // make the rate of deformation tensor deviatoric\n        mesh.vdov(k) = vdov ;\n        mesh.dxx(k) -= vdovthird ;\n        mesh.dyy(k) -= vdovthird ;\n        mesh.dzz(k) -= vdovthird ;\n\n        // See if any volumes are negative, and take appropriate action.\n        if (mesh.vnew(k) <= Real_t(0.0))\n        {\n           exit(VolumeError) ;\n        }\n      }\n   }\n}\n\n//static inline\n __attribute__((noinline)) void CalcMonotonicQGradientsForElems()\n{\n#define SUM4(a,b,c,d) (a + b + c + d)\n   Index_t numElem = mesh.numElem() ;\n   const Real_t ptiny = Real_t(1.e-36) ;\n\n   for (Index_t i = 0 ; i < numElem ; ++i ) {\n      Real_t ax,ay,az ;\n      Real_t dxv,dyv,dzv ;\n\n      const Index_t *elemToNode = mesh.nodelist(i);\n      Index_t n0 = elemToNode[0] ;\n      Index_t n1 = elemToNode[1] ;\n      Index_t n2 = elemToNode[2] ;\n      Index_t n3 = elemToNode[3] ;\n      Index_t n4 = elemToNode[4] ;\n      Index_t n5 = elemToNode[5] ;\n      Index_t n6 = elemToNode[6] ;\n      Index_t n7 = elemToNode[7] ;\n\n      Real_t x0 = mesh.x(n0) ;\n      Real_t x1 = mesh.x(n1) ;\n      Real_t x2 = mesh.x(n2) ;\n      Real_t x3 = mesh.x(n3) ;\n      Real_t x4 = mesh.x(n4) ;\n      Real_t x5 = mesh.x(n5) ;\n      Real_t x6 = mesh.x(n6) ;\n      Real_t x7 = mesh.x(n7) ;\n\n      Real_t y0 = mesh.y(n0) ;\n      Real_t y1 = mesh.y(n1) ;\n      Real_t y2 = mesh.y(n2) ;\n      Real_t y3 = mesh.y(n3) ;\n      Real_t y4 = mesh.y(n4) ;\n      Real_t y5 = mesh.y(n5) ;\n      Real_t y6 = mesh.y(n6) ;\n      Real_t y7 = mesh.y(n7) ;\n\n      Real_t z0 = mesh.z(n0) ;\n      Real_t z1 = mesh.z(n1) ;\n      Real_t z2 = mesh.z(n2) ;\n      Real_t z3 = mesh.z(n3) ;\n      Real_t z4 = mesh.z(n4) ;\n      Real_t z5 = mesh.z(n5) ;\n      Real_t z6 = mesh.z(n6) ;\n      Real_t z7 = mesh.z(n7) ;\n\n      Real_t xv0 = mesh.xd(n0) ;\n      Real_t xv1 = mesh.xd(n1) ;\n      Real_t xv2 = mesh.xd(n2) ;\n      Real_t xv3 = mesh.xd(n3) ;\n      Real_t xv4 = mesh.xd(n4) ;\n      Real_t xv5 = mesh.xd(n5) ;\n      Real_t xv6 = mesh.xd(n6) ;\n      Real_t xv7 = mesh.xd(n7) ;\n\n      Real_t yv0 = mesh.yd(n0) ;\n      Real_t yv1 = mesh.yd(n1) ;\n      Real_t yv2 = mesh.yd(n2) ;\n      Real_t yv3 = mesh.yd(n3) ;\n      Real_t yv4 = mesh.yd(n4) ;\n      Real_t yv5 = mesh.yd(n5) ;\n      Real_t yv6 = mesh.yd(n6) ;\n      Real_t yv7 = mesh.yd(n7) ;\n\n      Real_t zv0 = mesh.zd(n0) ;\n      Real_t zv1 = mesh.zd(n1) ;\n      Real_t zv2 = mesh.zd(n2) ;\n      Real_t zv3 = mesh.zd(n3) ;\n      Real_t zv4 = mesh.zd(n4) ;\n      Real_t zv5 = mesh.zd(n5) ;\n      Real_t zv6 = mesh.zd(n6) ;\n      Real_t zv7 = mesh.zd(n7) ;\n\n      Real_t vol = mesh.volo(i)*mesh.vnew(i) ;\n      Real_t norm = Real_t(1.0) / ( vol + ptiny ) ;\n\n      Real_t dxj = Real_t(-0.25)*(SUM4(x0,x1,x5,x4) - SUM4(x3,x2,x6,x7)) ;\n      Real_t dyj = Real_t(-0.25)*(SUM4(y0,y1,y5,y4) - SUM4(y3,y2,y6,y7)) ;\n      Real_t dzj = Real_t(-0.25)*(SUM4(z0,z1,z5,z4) - SUM4(z3,z2,z6,z7)) ;\n\n      Real_t dxi = Real_t( 0.25)*(SUM4(x1,x2,x6,x5) - SUM4(x0,x3,x7,x4)) ;\n      Real_t dyi = Real_t( 0.25)*(SUM4(y1,y2,y6,y5) - SUM4(y0,y3,y7,y4)) ;\n      Real_t dzi = Real_t( 0.25)*(SUM4(z1,z2,z6,z5) - SUM4(z0,z3,z7,z4)) ;\n\n      Real_t dxk = Real_t( 0.25)*(SUM4(x4,x5,x6,x7) - SUM4(x0,x1,x2,x3)) ;\n      Real_t dyk = Real_t( 0.25)*(SUM4(y4,y5,y6,y7) - SUM4(y0,y1,y2,y3)) ;\n      Real_t dzk = Real_t( 0.25)*(SUM4(z4,z5,z6,z7) - SUM4(z0,z1,z2,z3)) ;\n\n      /* find delvk and delxk ( i cross j ) */\n\n      ax = dyi*dzj - dzi*dyj ;\n      ay = dzi*dxj - dxi*dzj ;\n      az = dxi*dyj - dyi*dxj ;\n\n      mesh.delx_zeta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;\n\n      ax *= norm ;\n      ay *= norm ;\n      az *= norm ;\n\n      dxv = Real_t(0.25)*(SUM4(xv4,xv5,xv6,xv7) - SUM4(xv0,xv1,xv2,xv3)) ;\n      dyv = Real_t(0.25)*(SUM4(yv4,yv5,yv6,yv7) - SUM4(yv0,yv1,yv2,yv3)) ;\n      dzv = Real_t(0.25)*(SUM4(zv4,zv5,zv6,zv7) - SUM4(zv0,zv1,zv2,zv3)) ;\n\n      mesh.delv_zeta(i) = ax*dxv + ay*dyv + az*dzv ;\n\n      /* find delxi and delvi ( j cross k ) */\n\n      ax = dyj*dzk - dzj*dyk ;\n      ay = dzj*dxk - dxj*dzk ;\n      az = dxj*dyk - dyj*dxk ;\n\n      mesh.delx_xi(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;\n\n      ax *= norm ;\n      ay *= norm ;\n      az *= norm ;\n\n      dxv = Real_t(0.25)*(SUM4(xv1,xv2,xv6,xv5) - SUM4(xv0,xv3,xv7,xv4)) ;\n      dyv = Real_t(0.25)*(SUM4(yv1,yv2,yv6,yv5) - SUM4(yv0,yv3,yv7,yv4)) ;\n      dzv = Real_t(0.25)*(SUM4(zv1,zv2,zv6,zv5) - SUM4(zv0,zv3,zv7,zv4)) ;\n\n      mesh.delv_xi(i) = ax*dxv + ay*dyv + az*dzv ;\n\n      /* find delxj and delvj ( k cross i ) */\n\n      ax = dyk*dzi - dzk*dyi ;\n      ay = dzk*dxi - dxk*dzi ;\n      az = dxk*dyi - dyk*dxi ;\n\n      mesh.delx_eta(i) = vol / SQRT(ax*ax + ay*ay + az*az + ptiny) ;\n\n      ax *= norm ;\n      ay *= norm ;\n      az *= norm ;\n\n      dxv = Real_t(-0.25)*(SUM4(xv0,xv1,xv5,xv4) - SUM4(xv3,xv2,xv6,xv7)) ;\n      dyv = Real_t(-0.25)*(SUM4(yv0,yv1,yv5,yv4) - SUM4(yv3,yv2,yv6,yv7)) ;\n      dzv = Real_t(-0.25)*(SUM4(zv0,zv1,zv5,zv4) - SUM4(zv3,zv2,zv6,zv7)) ;\n\n      mesh.delv_eta(i) = ax*dxv + ay*dyv + az*dzv ;\n   }\n#undef SUM4\n}\n\n//static inline\n __attribute__((noinline)) void CalcMonotonicQRegionForElems(// parameters\n                          Real_t qlc_monoq,\n                          Real_t qqc_monoq,\n                          Real_t monoq_limiter_mult,\n                          Real_t monoq_max_slope,\n                          Real_t ptiny,\n\n                          // the elementset length\n                          Index_t elength )\n{\n   for ( Index_t ielem = 0 ; ielem < elength; ++ielem ) {\n      Real_t qlin, qquad ;\n      Real_t phixi, phieta, phizeta ;\n      Index_t i = mesh.matElemlist(ielem);\n      Int_t bcMask = mesh.elemBC(i) ;\n      Real_t delvm, delvp ;\n\n      /*  phixi     */\n      Real_t norm = Real_t(1.) / ( mesh.delv_xi(i) + ptiny ) ;\n\n      switch (bcMask & XI_M) {\n         case 0:         delvm = mesh.delv_xi(mesh.lxim(i)) ; break ;\n         case XI_M_SYMM: delvm = mesh.delv_xi(i) ;            break ;\n         case XI_M_FREE: delvm = Real_t(0.0) ;                break ;\n         default:        /* ERROR */ ;                        break ;\n      }\n      switch (bcMask & XI_P) {\n         case 0:         delvp = mesh.delv_xi(mesh.lxip(i)) ; break ;\n         case XI_P_SYMM: delvp = mesh.delv_xi(i) ;            break ;\n         case XI_P_FREE: delvp = Real_t(0.0) ;                break ;\n         default:        /* ERROR */ ;                        break ;\n      }\n\n      delvm = delvm * norm ;\n      delvp = delvp * norm ;\n\n      phixi = Real_t(.5) * ( delvm + delvp ) ;\n\n      delvm *= monoq_limiter_mult ;\n      delvp *= monoq_limiter_mult ;\n\n      if ( delvm < phixi ) phixi = delvm ;\n      if ( delvp < phixi ) phixi = delvp ;\n      if ( phixi < Real_t(0.)) phixi = Real_t(0.) ;\n      if ( phixi > monoq_max_slope) phixi = monoq_max_slope;\n\n\n      /*  phieta     */\n      norm = Real_t(1.) / ( mesh.delv_eta(i) + ptiny ) ;\n\n      switch (bcMask & ETA_M) {\n         case 0:          delvm = mesh.delv_eta(mesh.letam(i)) ; break ;\n         case ETA_M_SYMM: delvm = mesh.delv_eta(i) ;             break ;\n         case ETA_M_FREE: delvm = Real_t(0.0) ;                  break ;\n         default:         /* ERROR */ ;                          break ;\n      }\n      switch (bcMask & ETA_P) {\n         case 0:          delvp = mesh.delv_eta(mesh.letap(i)) ; break ;\n         case ETA_P_SYMM: delvp = mesh.delv_eta(i) ;             break ;\n         case ETA_P_FREE: delvp = Real_t(0.0) ;                  break ;\n         default:         /* ERROR */ ;                          break ;\n      }\n\n      delvm = delvm * norm ;\n      delvp = delvp * norm ;\n\n      phieta = Real_t(.5) * ( delvm + delvp ) ;\n\n      delvm *= monoq_limiter_mult ;\n      delvp *= monoq_limiter_mult ;\n\n      if ( delvm  < phieta ) phieta = delvm ;\n      if ( delvp  < phieta ) phieta = delvp ;\n      if ( phieta < Real_t(0.)) phieta = Real_t(0.) ;\n      if ( phieta > monoq_max_slope)  phieta = monoq_max_slope;\n\n      /*  phizeta     */\n      norm = Real_t(1.) / ( mesh.delv_zeta(i) + ptiny ) ;\n\n      switch (bcMask & ZETA_M) {\n         case 0:           delvm = mesh.delv_zeta(mesh.lzetam(i)) ; break ;\n         case ZETA_M_SYMM: delvm = mesh.delv_zeta(i) ;              break ;\n         case ZETA_M_FREE: delvm = Real_t(0.0) ;                    break ;\n         default:          /* ERROR */ ;                            break ;\n      }\n      switch (bcMask & ZETA_P) {\n         case 0:           delvp = mesh.delv_zeta(mesh.lzetap(i)) ; break ;\n         case ZETA_P_SYMM: delvp = mesh.delv_zeta(i) ;              break ;\n         case ZETA_P_FREE: delvp = Real_t(0.0) ;                    break ;\n         default:          /* ERROR */ ;                            break ;\n      }\n\n      delvm = delvm * norm ;\n      delvp = delvp * norm ;\n\n      phizeta = Real_t(.5) * ( delvm + delvp ) ;\n\n      delvm *= monoq_limiter_mult ;\n      delvp *= monoq_limiter_mult ;\n\n      if ( delvm   < phizeta ) phizeta = delvm ;\n      if ( delvp   < phizeta ) phizeta = delvp ;\n      if ( phizeta < Real_t(0.)) phizeta = Real_t(0.);\n      if ( phizeta > monoq_max_slope  ) phizeta = monoq_max_slope;\n\n      /* Remove length scale */\n\n      if ( mesh.vdov(i) > Real_t(0.) )  {\n         qlin  = Real_t(0.) ;\n         qquad = Real_t(0.) ;\n      }\n      else {\n         Real_t delvxxi   = mesh.delv_xi(i)   * mesh.delx_xi(i)   ;\n         Real_t delvxeta  = mesh.delv_eta(i)  * mesh.delx_eta(i)  ;\n         Real_t delvxzeta = mesh.delv_zeta(i) * mesh.delx_zeta(i) ;\n\n         if ( delvxxi   > Real_t(0.) ) delvxxi   = Real_t(0.) ;\n         if ( delvxeta  > Real_t(0.) ) delvxeta  = Real_t(0.) ;\n         if ( delvxzeta > Real_t(0.) ) delvxzeta = Real_t(0.) ;\n\n         Real_t rho = mesh.elemMass(i) / (mesh.volo(i) * mesh.vnew(i)) ;\n\n         qlin = -qlc_monoq * rho *\n            (  delvxxi   * (Real_t(1.) - phixi) +\n               delvxeta  * (Real_t(1.) - phieta) +\n               delvxzeta * (Real_t(1.) - phizeta)  ) ;\n\n         qquad = qqc_monoq * rho *\n            (  delvxxi*delvxxi     * (Real_t(1.) - phixi*phixi) +\n               delvxeta*delvxeta   * (Real_t(1.) - phieta*phieta) +\n               delvxzeta*delvxzeta * (Real_t(1.) - phizeta*phizeta)  ) ;\n      }\n\n      mesh.qq(i) = qquad ;\n      mesh.ql(i) = qlin  ;\n   }\n}\n\n//static inline\n __attribute__((noinline)) void CalcMonotonicQForElems()\n{  \n   //\n   // initialize parameters\n   // \n   const Real_t ptiny        = Real_t(1.e-36) ;\n   Real_t monoq_max_slope    = mesh.monoq_max_slope() ;\n   Real_t monoq_limiter_mult = mesh.monoq_limiter_mult() ;\n\n   //\n   // calculate the monotonic q for pure regions\n   //\n   Index_t elength = mesh.numElem() ;\n   if (elength > 0) {\n      Real_t qlc_monoq = mesh.qlc_monoq();\n      Real_t qqc_monoq = mesh.qqc_monoq();\n      CalcMonotonicQRegionForElems(// parameters\n                           qlc_monoq,\n                           qqc_monoq,\n                           monoq_limiter_mult,\n                           monoq_max_slope,\n                           ptiny,\n\n                           // the elemset length\n                           elength );\n   }\n}\n\n//static inline\n __attribute__((noinline)) void CalcQForElems()\n{\n   Real_t qstop = mesh.qstop() ;\n   Index_t numElem = mesh.numElem() ;\n\n   //\n   // MONOTONIC Q option\n   //\n\n   /* Calculate velocity gradients */\n   CalcMonotonicQGradientsForElems() ;\n\n   /* Transfer veloctiy gradients in the first order elements */\n   /* problem->commElements->Transfer(CommElements::monoQ) ; */\n   CalcMonotonicQForElems() ;\n\n   /* Don't allow excessive artificial viscosity */\n   if (numElem != 0) {\n      Index_t idx = -1; \n      for (Index_t i=0; i<numElem; ++i) {\n         if ( mesh.q(i) > qstop ) {\n            idx = i ;\n            break ;\n         }\n      }\n\n      if(idx >= 0) {\n         exit(QStopError) ;\n      }\n   }\n}\n\n//static inline\n __attribute__((noinline)) void CalcPressureForElems(Real_t* p_new, Real_t* bvc,\n                          Real_t* pbvc, Real_t* e_old,\n                          Real_t* compression, Real_t *vnewc,\n                          Real_t pmin,\n                          Real_t p_cut, Real_t eosvmax,\n                          Index_t length)\n{\n   Real_t c1s = Real_t(2.0)/Real_t(3.0) ;\n   for (Index_t i = 0; i < length ; ++i) {\n      bvc[i] = c1s * (compression[i] + Real_t(1.));\n      pbvc[i] = c1s;\n   }\n\n   for (Index_t i = 0 ; i < length ; ++i){\n      p_new[i] = bvc[i] * e_old[i] ;\n\n      if    (FABS(p_new[i]) <  p_cut   )\n         p_new[i] = Real_t(0.0) ;\n\n      if    ( vnewc[i] >= eosvmax ) /* impossible condition here? */\n         p_new[i] = Real_t(0.0) ;\n\n      if    (p_new[i]       <  pmin)\n         p_new[i]   = pmin ;\n   }\n}\n\n//static inline\n __attribute__((noinline)) void CalcEnergyForElems(Real_t* p_new, Real_t* e_new, Real_t* q_new,\n                        Real_t* bvc, Real_t* pbvc,\n                        Real_t* p_old, Real_t* e_old, Real_t* q_old,\n                        Real_t* compression, Real_t* compHalfStep,\n                        Real_t* vnewc, Real_t* work, Real_t* delvc, Real_t pmin,\n                        Real_t p_cut, Real_t  e_cut, Real_t q_cut, Real_t emin,\n                        Real_t* qq, Real_t* ql,\n                        Real_t rho0,\n                        Real_t eosvmax,\n                        Index_t length)\n{\n   const Real_t sixth = Real_t(1.0) / Real_t(6.0) ;\n   Real_t *pHalfStep = Allocate<Real_t>(length) ;\n\n   for (Index_t i = 0 ; i < length ; ++i) {\n      e_new[i] = e_old[i] - Real_t(0.5) * delvc[i] * (p_old[i] + q_old[i])\n         + Real_t(0.5) * work[i];\n\n      if (e_new[i]  < emin ) {\n         e_new[i] = emin ;\n      }\n   }\n\n   CalcPressureForElems(pHalfStep, bvc, pbvc, e_new, compHalfStep, vnewc,\n                   pmin, p_cut, eosvmax, length);\n\n   for (Index_t i = 0 ; i < length ; ++i) {\n      Real_t vhalf = Real_t(1.) / (Real_t(1.) + compHalfStep[i]) ;\n\n      if ( delvc[i] > Real_t(0.) ) {\n         q_new[i] /* = qq[i] = ql[i] */ = Real_t(0.) ;\n      }\n      else {\n         Real_t ssc = ( pbvc[i] * e_new[i]\n                 + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ;\n\n         if ( ssc <= Real_t(.111111e-36) ) {\n            ssc =Real_t(.333333e-18) ;\n         } else {\n            ssc = SQRT(ssc) ;\n         }\n\n         q_new[i] = (ssc*ql[i] + qq[i]) ;\n      }\n\n      e_new[i] = e_new[i] + Real_t(0.5) * delvc[i]\n         * (  Real_t(3.0)*(p_old[i]     + q_old[i])\n              - Real_t(4.0)*(pHalfStep[i] + q_new[i])) ;\n   }\n   for (Index_t i = 0 ; i < length ; ++i) {\n\n      e_new[i] += Real_t(0.5) * work[i];\n\n      if (FABS(e_new[i]) < e_cut) {\n         e_new[i] = Real_t(0.)  ;\n      }\n      if (     e_new[i]  < emin ) {\n         e_new[i] = emin ;\n      }\n   }\n\n   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,\n                   pmin, p_cut, eosvmax, length);\n\n   for (Index_t i = 0 ; i < length ; ++i){\n      Real_t q_tilde ;\n\n      if (delvc[i] > Real_t(0.)) {\n         q_tilde = Real_t(0.) ;\n      }\n      else {\n         Real_t ssc = ( pbvc[i] * e_new[i]\n                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;\n\n         if ( ssc <= Real_t(.111111e-36) ) {\n            ssc = Real_t(.333333e-18) ;\n         } else {\n            ssc = SQRT(ssc) ;\n         }\n\n         q_tilde = (ssc*ql[i] + qq[i]) ;\n      }\n\n      e_new[i] = e_new[i] - (  Real_t(7.0)*(p_old[i]     + q_old[i])\n                               - Real_t(8.0)*(pHalfStep[i] + q_new[i])\n                               + (p_new[i] + q_tilde)) * delvc[i]*sixth ;\n\n      if (FABS(e_new[i]) < e_cut) {\n         e_new[i] = Real_t(0.)  ;\n      }\n      if (     e_new[i]  < emin ) {\n         e_new[i] = emin ;\n      }\n   }\n\n   CalcPressureForElems(p_new, bvc, pbvc, e_new, compression, vnewc,\n                   pmin, p_cut, eosvmax, length);\n\n   for (Index_t i = 0 ; i < length ; ++i){\n\n      if ( delvc[i] <= Real_t(0.) ) {\n         Real_t ssc = ( pbvc[i] * e_new[i]\n                 + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ;\n\n         if ( ssc <= Real_t(.111111e-36) ) {\n            ssc = Real_t(.333333e-18) ;\n         } else {\n            ssc = SQRT(ssc) ;\n         }\n\n         q_new[i] = (ssc*ql[i] + qq[i]) ;\n\n         if (FABS(q_new[i]) < q_cut) q_new[i] = Real_t(0.) ;\n      }\n   }\n\n   Release(&pHalfStep) ;\n\n   return ;\n}\n\n//static inline\n __attribute__((noinline)) void CalcSoundSpeedForElems(Real_t *vnewc, Real_t rho0, Real_t *enewc,\n                            Real_t *pnewc, Real_t *pbvc,\n                            Real_t *bvc, Real_t ss4o3, Index_t nz)\n{\n   for (Index_t i = 0; i < nz ; ++i) {\n      Index_t iz = mesh.matElemlist(i);\n      Real_t ssTmp = (pbvc[i] * enewc[i] + vnewc[i] * vnewc[i] *\n                 bvc[i] * pnewc[i]) / rho0;\n      if (ssTmp <= Real_t(.111111e-36)) {\n         ssTmp = Real_t(.111111e-36);\n      }\n      mesh.ss(iz) = SQRT(ssTmp);\n   }\n}\n\n//static inline\n __attribute__((noinline)) void EvalEOSForElems(Real_t *vnewc, Index_t length)\n{\n   Real_t  e_cut = mesh.e_cut();\n   Real_t  p_cut = mesh.p_cut();\n   Real_t  ss4o3 = mesh.ss4o3();\n   Real_t  q_cut = mesh.q_cut();\n\n   Real_t eosvmax = mesh.eosvmax() ;\n   Real_t eosvmin = mesh.eosvmin() ;\n   Real_t pmin    = mesh.pmin() ;\n   Real_t emin    = mesh.emin() ;\n   Real_t rho0    = mesh.refdens() ;\n\n   Real_t *e_old = Allocate<Real_t>(length) ;\n   Real_t *delvc = Allocate<Real_t>(length) ;\n   Real_t *p_old = Allocate<Real_t>(length) ;\n   Real_t *q_old = Allocate<Real_t>(length) ;\n   Real_t *compression = Allocate<Real_t>(length) ;\n   Real_t *compHalfStep = Allocate<Real_t>(length) ;\n   Real_t *qq = Allocate<Real_t>(length) ;\n   Real_t *ql = Allocate<Real_t>(length) ;\n   Real_t *work = Allocate<Real_t>(length) ;\n   Real_t *p_new = Allocate<Real_t>(length) ;\n   Real_t *e_new = Allocate<Real_t>(length) ;\n   Real_t *q_new = Allocate<Real_t>(length) ;\n   Real_t *bvc = Allocate<Real_t>(length) ;\n   Real_t *pbvc = Allocate<Real_t>(length) ;\n\n   /* compress data, minimal set */\n   for (Index_t i=0; i<length; ++i) {\n      Index_t zidx = mesh.matElemlist(i) ;\n      e_old[i] = mesh.e(zidx) ;\n   }\n\n   for (Index_t i=0; i<length; ++i) {\n      Index_t zidx = mesh.matElemlist(i) ;\n      delvc[i] = mesh.delv(zidx) ;\n   }\n\n   for (Index_t i=0; i<length; ++i) {\n      Index_t zidx = mesh.matElemlist(i) ;\n      p_old[i] = mesh.p(zidx) ;\n   }\n\n   for (Index_t i=0; i<length; ++i) {\n      Index_t zidx = mesh.matElemlist(i) ;\n      q_old[i] = mesh.q(zidx) ;\n   }\n\n   for (Index_t i = 0; i < length ; ++i) {\n      Real_t vchalf ;\n      compression[i] = Real_t(1.) / vnewc[i] - Real_t(1.);\n      vchalf = vnewc[i] - delvc[i] * Real_t(.5);\n      compHalfStep[i] = Real_t(1.) / vchalf - Real_t(1.);\n   }\n\n   /* Check for v > eosvmax or v < eosvmin */\n   if ( eosvmin != Real_t(0.) ) {\n      for(Index_t i=0 ; i<length ; ++i) {\n         if (vnewc[i] <= eosvmin) { /* impossible due to calling func? */\n            compHalfStep[i] = compression[i] ;\n         }\n      }\n   }\n   if ( eosvmax != Real_t(0.) ) {\n      for(Index_t i=0 ; i<length ; ++i) {\n         if (vnewc[i] >= eosvmax) { /* impossible due to calling func? */\n            p_old[i]        = Real_t(0.) ;\n            compression[i]  = Real_t(0.) ;\n            compHalfStep[i] = Real_t(0.) ;\n         }\n      }\n   }\n\n   for (Index_t i = 0 ; i < length ; ++i) {\n      Index_t zidx = mesh.matElemlist(i) ;\n      qq[i] = mesh.qq(zidx) ;\n      ql[i] = mesh.ql(zidx) ;\n      work[i] = Real_t(0.) ; \n   }\n\n   CalcEnergyForElems(p_new, e_new, q_new, bvc, pbvc,\n                 p_old, e_old,  q_old, compression, compHalfStep,\n                 vnewc, work,  delvc, pmin,\n                 p_cut, e_cut, q_cut, emin,\n                 qq, ql, rho0, eosvmax, length);\n\n\n   for (Index_t i=0; i<length; ++i) {\n      Index_t zidx = mesh.matElemlist(i) ;\n      mesh.p(zidx) = p_new[i] ;\n   }\n\n   for (Index_t i=0; i<length; ++i) {\n      Index_t zidx = mesh.matElemlist(i) ;\n      mesh.e(zidx) = e_new[i] ;\n   }\n\n   for (Index_t i=0; i<length; ++i) {\n      Index_t zidx = mesh.matElemlist(i) ;\n      mesh.q(zidx) = q_new[i] ;\n   }\n\n   CalcSoundSpeedForElems(vnewc, rho0, e_new, p_new,\n             pbvc, bvc, ss4o3, length) ;\n\n   Release(&pbvc) ;\n   Release(&bvc) ;\n   Release(&q_new) ;\n   Release(&e_new) ;\n   Release(&p_new) ;\n   Release(&work) ;\n   Release(&ql) ;\n   Release(&qq) ;\n   Release(&compHalfStep) ;\n   Release(&compression) ;\n   Release(&q_old) ;\n   Release(&p_old) ;\n   Release(&delvc) ;\n   Release(&e_old) ;\n}\n\n//static inline\n __attribute__((noinline)) void ApplyMaterialPropertiesForElems()\n{\n  Index_t length = mesh.numElem() ;\n\n  if (length != 0) {\n    /* Expose all of the variables needed for material evaluation */\n    Real_t eosvmin = mesh.eosvmin() ;\n    Real_t eosvmax = mesh.eosvmax() ;\n    Real_t *vnewc = Allocate<Real_t>(length) ;\n\n    for (Index_t i=0 ; i<length ; ++i) {\n       Index_t zn = mesh.matElemlist(i) ;\n       vnewc[i] = mesh.vnew(zn) ;\n    }\n\n    if (eosvmin != Real_t(0.)) {\n       for(Index_t i=0 ; i<length ; ++i) {\n          if (vnewc[i] < eosvmin)\n             vnewc[i] = eosvmin ;\n       }\n    }\n\n    if (eosvmax != Real_t(0.)) {\n       for(Index_t i=0 ; i<length ; ++i) {\n          if (vnewc[i] > eosvmax)\n             vnewc[i] = eosvmax ;\n       }\n    }\n\n    for (Index_t i=0; i<length; ++i) {\n       Index_t zn = mesh.matElemlist(i) ;\n       Real_t vc = mesh.v(zn) ;\n       if (eosvmin != Real_t(0.)) {\n          if (vc < eosvmin)\n             vc = eosvmin ;\n       }\n       if (eosvmax != Real_t(0.)) {\n          if (vc > eosvmax)\n             vc = eosvmax ;\n       }\n       if (vc <= 0.) {\n          exit(VolumeError) ;\n       }\n    }\n\n    EvalEOSForElems(vnewc, length);\n\n    Release(&vnewc) ;\n\n  }\n}\n\n//static inline\n __attribute__((noinline)) void UpdateVolumesForElems()\n{\n   Index_t numElem = mesh.numElem();\n   if (numElem != 0) {\n      Real_t v_cut = mesh.v_cut();\n\n      for(Index_t i=0 ; i<numElem ; ++i) {\n         Real_t tmpV ;\n         tmpV = mesh.vnew(i) ;\n\n         if ( FABS(tmpV - Real_t(1.0)) < v_cut )\n            tmpV = Real_t(1.0) ;\n         mesh.v(i) = tmpV ;\n      }\n   }\n\n   return ;\n}\n\n//static inline\n __attribute__((noinline)) void LagrangeElements()\n{\n  const Real_t deltatime = mesh.deltatime() ;\n\n  CalcLagrangeElements(deltatime) ;\n\n  /* Calculate Q.  (Monotonic q option requires communication) */\n  CalcQForElems() ;\n\n  ApplyMaterialPropertiesForElems() ;\n\n  UpdateVolumesForElems() ;\n}\n\n//static inline\n __attribute__((noinline)) void CalcCourantConstraintForElems()\n{\n   Real_t dtcourant = Real_t(1.0e+20) ;\n   Index_t   courant_elem = -1 ;\n   Real_t      qqc = mesh.qqc() ;\n   Index_t length = mesh.numElem() ;\n\n   Real_t  qqc2 = Real_t(64.0) * qqc * qqc ;\n\n   for (Index_t i = 0 ; i < length ; ++i) {\n      Index_t indx = mesh.matElemlist(i) ;\n\n      Real_t dtf = mesh.ss(indx) * mesh.ss(indx) ;\n\n      if ( mesh.vdov(indx) < Real_t(0.) ) {\n\n         dtf = dtf\n            + qqc2 * mesh.arealg(indx) * mesh.arealg(indx)\n            * mesh.vdov(indx) * mesh.vdov(indx) ;\n      }\n\n      dtf = SQRT(dtf) ;\n\n      dtf = mesh.arealg(indx) / dtf ;\n\n   /* determine minimum timestep with its corresponding elem */\n      if (mesh.vdov(indx) != Real_t(0.)) {\n         if ( dtf < dtcourant ) {\n            dtcourant = dtf ;\n            courant_elem = indx ;\n         }\n      }\n   }\n\n   /* Don't try to register a time constraint if none of the elements\n    * were active */\n   if (courant_elem != -1) {\n      mesh.dtcourant() = dtcourant ;\n   }\n\n   return ;\n}\n\n//static inline\n __attribute__((noinline)) void CalcHydroConstraintForElems()\n{\n   Real_t dthydro = Real_t(1.0e+20) ;\n   Index_t hydro_elem = -1 ;\n   Real_t dvovmax = mesh.dvovmax() ;\n   Index_t length = mesh.numElem() ;\n\n   for (Index_t i = 0 ; i < length ; ++i) {\n      Index_t indx = mesh.matElemlist(i) ;\n\n      if (mesh.vdov(indx) != Real_t(0.)) {\n         Real_t dtdvov = dvovmax / (FABS(mesh.vdov(indx))+Real_t(1.e-20)) ;\n         if ( dthydro > dtdvov ) {\n            dthydro = dtdvov ;\n            hydro_elem = indx ;\n         }\n      }\n   }\n\n   if (hydro_elem != -1) {\n      mesh.dthydro() = dthydro ;\n   }\n\n   return ;\n}\n\n//static inline\n __attribute__((noinline)) void CalcTimeConstraintsForElems() {\n   /* evaluate time constraint */\n   CalcCourantConstraintForElems() ;\n\n   /* check hydro constraint */\n   CalcHydroConstraintForElems() ;\n}\n\n//static inline\n __attribute__((noinline)) void LagrangeLeapFrog()\n{\n   /* calculate nodal forces, accelerations, velocities, positions, with\n    * applied boundary conditions and slide surface considerations */\n   LagrangeNodal();\n\n   /* calculate element quantities (i.e. velocity gradient & q), and update\n    * material states */\n   LagrangeElements();\n\n   CalcTimeConstraintsForElems();\n\n   // LagrangeRelease() ;  Creation/destruction of temps may be important to capture \n}\n\n __attribute__((noinline)) int main(int argc, char *argv[])\n{\n   Index_t edgeElems = atoi(argv[1]);\n   Index_t edgeNodes = edgeElems+1 ;\n   // Real_t ds = Real_t(1.125)/Real_t(edgeElems) ; /* may accumulate roundoff */\n   Real_t tx, ty, tz ;\n   Index_t nidx, zidx ;\n   Index_t meshElems ;\n\n   /* get run options to measure various metrics */\n\n   /* ... */\n\n   /****************************/\n   /*   Initialize Sedov Mesh  */\n   /****************************/\n\n   /* construct a uniform box for this processor */\n\n   mesh.sizeX()   = edgeElems ;\n   mesh.sizeY()   = edgeElems ;\n   mesh.sizeZ()   = edgeElems ;\n   mesh.numElem() = edgeElems*edgeElems*edgeElems ;\n   mesh.numNode() = edgeNodes*edgeNodes*edgeNodes ;\n\n   meshElems = mesh.numElem() ;\n\n\n   /* allocate field memory */\n\n   mesh.AllocateElemPersistent(mesh.numElem()) ;\n   mesh.AllocateElemTemporary (mesh.numElem()) ;\n\n   mesh.AllocateNodalPersistent(mesh.numNode()) ;\n   mesh.AllocateNodesets(edgeNodes*edgeNodes) ;\n\n\n   /* initialize nodal coordinates */\n\n   nidx = 0 ;\n   tz  = Real_t(0.) ;\n   for (Index_t plane=0; plane<edgeNodes; ++plane) {\n      ty = Real_t(0.) ;\n      for (Index_t row=0; row<edgeNodes; ++row) {\n         tx = Real_t(0.) ;\n         for (Index_t col=0; col<edgeNodes; ++col) {\n            mesh.x(nidx) = tx ;\n            mesh.y(nidx) = ty ;\n            mesh.z(nidx) = tz ;\n            ++nidx ;\n            // tx += ds ; /* may accumulate roundoff... */\n            tx = Real_t(1.125)*Real_t(col+1)/Real_t(edgeElems) ;\n         }\n         // ty += ds ;  /* may accumulate roundoff... */\n         ty = Real_t(1.125)*Real_t(row+1)/Real_t(edgeElems) ;\n      }\n      // tz += ds ;  /* may accumulate roundoff... */\n      tz = Real_t(1.125)*Real_t(plane+1)/Real_t(edgeElems) ;\n   }\n\n\n   /* embed hexehedral elements in nodal point lattice */\n\n   nidx = 0 ;\n   zidx = 0 ;\n   for (Index_t plane=0; plane<edgeElems; ++plane) {\n      for (Index_t row=0; row<edgeElems; ++row) {\n         for (Index_t col=0; col<edgeElems; ++col) {\n            Index_t *localNode = mesh.nodelist(zidx) ;\n            localNode[0] = nidx                                       ;\n            localNode[1] = nidx                                   + 1 ;\n            localNode[2] = nidx                       + edgeNodes + 1 ;\n            localNode[3] = nidx                       + edgeNodes     ;\n            localNode[4] = nidx + edgeNodes*edgeNodes                 ;\n            localNode[5] = nidx + edgeNodes*edgeNodes             + 1 ;\n            localNode[6] = nidx + edgeNodes*edgeNodes + edgeNodes + 1 ;\n            localNode[7] = nidx + edgeNodes*edgeNodes + edgeNodes     ;\n            ++zidx ;\n            ++nidx ;\n         }\n         ++nidx ;\n      }\n      nidx += edgeNodes ;\n   }\n\n   /* Create a material IndexSet (entire mesh same material for now) */\n   for (Index_t i=0; i<meshElems; ++i) {\n      mesh.matElemlist(i) = i ;\n   }\n   \n   /* initialize material parameters */\n   mesh.dtfixed() = Real_t(-1.0e-7) ;\n   mesh.deltatime() = Real_t(1.0e-7) ;\n   mesh.deltatimemultlb() = Real_t(1.1) ;\n   mesh.deltatimemultub() = Real_t(1.2) ;\n   mesh.stoptime()  = Real_t(1.0e-2) ;\n   mesh.dtcourant() = Real_t(1.0e+20) ;\n   mesh.dthydro()   = Real_t(1.0e+20) ;\n   mesh.dtmax()     = Real_t(1.0e-2) ;\n   mesh.time()    = Real_t(0.) ;\n   mesh.cycle()   = 0 ;\n\n   mesh.e_cut() = Real_t(1.0e-7) ;\n   mesh.p_cut() = Real_t(1.0e-7) ;\n   mesh.q_cut() = Real_t(1.0e-7) ;\n   mesh.u_cut() = Real_t(1.0e-7) ;\n   mesh.v_cut() = Real_t(1.0e-10) ;\n\n   mesh.hgcoef()      = Real_t(3.0) ;\n   mesh.ss4o3()       = Real_t(4.0)/Real_t(3.0) ;\n\n   mesh.qstop()              =  Real_t(1.0e+12) ;\n   mesh.monoq_max_slope()    =  Real_t(1.0) ;\n   mesh.monoq_limiter_mult() =  Real_t(2.0) ;\n   mesh.qlc_monoq()          = Real_t(0.5) ;\n   mesh.qqc_monoq()          = Real_t(2.0)/Real_t(3.0) ;\n   mesh.qqc()                = Real_t(2.0) ;\n\n   mesh.pmin() =  Real_t(0.) ;\n   mesh.emin() = Real_t(-1.0e+15) ;\n\n   mesh.dvovmax() =  Real_t(0.1) ;\n\n   mesh.eosvmax() =  Real_t(1.0e+9) ;\n   mesh.eosvmin() =  Real_t(1.0e-9) ;\n\n   mesh.refdens() =  Real_t(1.0) ;\n\n   /* initialize field data */\n   for (Index_t i=0; i<meshElems; ++i) {\n      Real_t x_local[8], y_local[8], z_local[8] ;\n      Index_t *elemToNode = mesh.nodelist(i) ;\n      for( Index_t lnode=0 ; lnode<8 ; ++lnode )\n      {\n        Index_t gnode = elemToNode[lnode];\n        x_local[lnode] = mesh.x(gnode);\n        y_local[lnode] = mesh.y(gnode);\n        z_local[lnode] = mesh.z(gnode);\n      }\n\n      // volume calculations\n      Real_t volume = CalcElemVolume(x_local, y_local, z_local );\n      mesh.volo(i) = volume ;\n      mesh.elemMass(i) = volume ;\n      for (Index_t j=0; j<8; ++j) {\n         Index_t idx = elemToNode[j] ;\n         mesh.nodalMass(idx) += volume / Real_t(8.0) ;\n      }\n   }\n\n   /* deposit energy */\n   mesh.e(0) = Real_t(3.948746e+7) ;\n\n   /* set up symmetry nodesets */\n   nidx = 0 ;\n   for (Index_t i=0; i<edgeNodes; ++i) {\n      Index_t planeInc = i*edgeNodes*edgeNodes ;\n      Index_t rowInc   = i*edgeNodes ;\n      for (Index_t j=0; j<edgeNodes; ++j) {\n         mesh.symmX(nidx) = planeInc + j*edgeNodes ;\n         mesh.symmY(nidx) = planeInc + j ;\n         mesh.symmZ(nidx) = rowInc   + j ;\n         ++nidx ;\n      }\n   }\n\n   /* set up elemement connectivity information */\n   mesh.lxim(0) = 0 ;\n   for (Index_t i=1; i<meshElems; ++i) {\n      mesh.lxim(i)   = i-1 ;\n      mesh.lxip(i-1) = i ;\n   }\n   mesh.lxip(meshElems-1) = meshElems-1 ;\n\n   for (Index_t i=0; i<edgeElems; ++i) {\n      mesh.letam(i) = i ; \n      mesh.letap(meshElems-edgeElems+i) = meshElems-edgeElems+i ;\n   }\n   for (Index_t i=edgeElems; i<meshElems; ++i) {\n      mesh.letam(i) = i-edgeElems ;\n      mesh.letap(i-edgeElems) = i ;\n   }\n\n   for (Index_t i=0; i<edgeElems*edgeElems; ++i) {\n      mesh.lzetam(i) = i ;\n      mesh.lzetap(meshElems-edgeElems*edgeElems+i) = meshElems-edgeElems*edgeElems+i ;\n   }\n   for (Index_t i=edgeElems*edgeElems; i<meshElems; ++i) {\n      mesh.lzetam(i) = i - edgeElems*edgeElems ;\n      mesh.lzetap(i-edgeElems*edgeElems) = i ;\n   }\n\n   /* set up boundary condition information */\n   for (Index_t i=0; i<meshElems; ++i) {\n      mesh.elemBC(i) = 0 ;  /* clear BCs by default */\n   }\n\n   /* faces on \"external\" boundaries will be */\n   /* symmetry plane or free surface BCs */\n   for (Index_t i=0; i<edgeElems; ++i) {\n      Index_t planeInc = i*edgeElems*edgeElems ;\n      Index_t rowInc   = i*edgeElems ;\n      for (Index_t j=0; j<edgeElems; ++j) {\n         mesh.elemBC(planeInc+j*edgeElems) |= XI_M_SYMM ;\n         mesh.elemBC(planeInc+j*edgeElems+edgeElems-1) |= XI_P_FREE ;\n         mesh.elemBC(planeInc+j) |= ETA_M_SYMM ;\n         mesh.elemBC(planeInc+j+edgeElems*edgeElems-edgeElems) |= ETA_P_FREE ;\n         mesh.elemBC(rowInc+j) |= ZETA_M_SYMM ;\n         mesh.elemBC(rowInc+j+meshElems-edgeElems*edgeElems) |= ZETA_P_FREE ;\n      }\n   }\n\n   timeval start, end;\n   gettimeofday(&start, NULL);\n\n\n   /* timestep to solution */\n   while(mesh.time() < mesh.stoptime() ) {\n      TimeIncrement() ;\n      LagrangeLeapFrog() ;\n      /* problem->commNodes->Transfer(CommNodes::syncposvel) ; */\n#if LULESH_SHOW_PROGRESS\n      printf(\"time = %e, dt=%e\\n\",\n             double(mesh.time()), double(mesh.deltatime()) ) ;\n#endif\n   }\n\n   \tgettimeofday(&end, NULL);\n        double elapsed_time = double(end.tv_sec - start.tv_sec) + double(end.tv_usec - start.tv_usec) *1e-6;\n\n\n        printf(\"\\n\\nElapsed time = %12.6e\\n\\n\", elapsed_time);\n\n        Index_t ElemId = 0;\n        printf(\"Run completed:  \\n\");\n        printf(\"   Problem size        =  %i \\n\",    edgeElems);\n        printf(\"   Iteration count     =  %i \\n\",    mesh.cycle());\n        printf(\"   Final Origin Energy = %12.6e \\n\", mesh.e(ElemId));\n\n        Real_t   MaxAbsDiff = Real_t(0.0);\n        Real_t TotalAbsDiff = Real_t(0.0);\n        Real_t   MaxRelDiff = Real_t(0.0);\n\n        for (Index_t j=0; j<edgeElems; ++j) {\n                for (Index_t k=j+1; k<edgeElems; ++k) {\n                        Real_t AbsDiff = FABS(mesh.e(j*edgeElems+k) - mesh.e(k*edgeElems+j));\n                        TotalAbsDiff  += AbsDiff;\n\n                        if (MaxAbsDiff <AbsDiff) MaxAbsDiff = AbsDiff;\n\n                        Real_t RelDiff = AbsDiff / mesh.e(k*edgeElems+j);\n\n                        if (MaxRelDiff <RelDiff)  MaxRelDiff = RelDiff;\n                }\n        }\n      FILE *out_file;\n      out_file = fopen(\"element.dat\",\"w\");\n        for (Index_t i = 0; i < edgeElems; i++){\n           fprintf( out_file, \"%f \\n\" , mesh.e(i));\n        }\n        fclose(out_file);\n\n        printf(\"   Testing Plane 0 of Energy Array:\\n\");\n        printf(\"        MaxAbsDiff   = %12.6e\\n\",   MaxAbsDiff   );\n        printf(\"        TotalAbsDiff = %12.6e\\n\",   TotalAbsDiff );\n        printf(\"        MaxRelDiff   = %12.6e\\n\\n\", MaxRelDiff   );\n\n\n   //   FILE *fp = fopen(\"x.asc\",\"wb\");\n   //for (Index_t i=0; i<mesh.numElem(); i++)\n   //    fprintf(fp,\"%.6f\\n\",mesh.x(i));\n   //fclose(fp);\n               \n   return 0 ;\n}\n", "label": 2}
{"code": "#include <stdio.h>\n#include <stdlib.h>\n#include <omp.h>\n#include <sys/time.h>\n\n// Returns the current system time in microseconds \nlong long get_time()\n{\n    struct timeval tv;\n    gettimeofday(&tv, NULL);\n    return (tv.tv_sec * 1000000) + tv.tv_usec;\n\n}\n\nusing namespace std;\n\n#define BLOCK_SIZE 16\n#define BLOCK_SIZE_C BLOCK_SIZE\n#define BLOCK_SIZE_R BLOCK_SIZE\n\n#define STR_SIZE\t256\n\n/* maximum power density possible (say 300W for a 10mm x 10mm chip)\t*/\n#define MAX_PD\t(3.0e6)\n/* required precision in degrees\t*/\n#define PRECISION\t0.001\n#define SPEC_HEAT_SI 1.75e6\n#define K_SI 100\n/* capacitance fitting factor\t*/\n#define FACTOR_CHIP\t0.5\n#define OPEN\n//#define NUM_THREAD 4\n\ntypedef float FLOAT;\n\n/* chip parameters\t*/\nconst FLOAT t_chip = 0.0005;\nconst FLOAT chip_height = 0.016;\nconst FLOAT chip_width = 0.016;\n\n#ifdef OMP_OFFLOAD\n#pragma offload_attribute(push, target(mic))\n#endif\n\n/* ambient temperature, assuming no package at all\t*/\nconst FLOAT amb_temp = 80.0;\n\nint num_omp_threads;\n\n/* Single iteration of the transient solver in the grid model.\n * advances the solution of the discretized difference equations \n * by one time step\n */\nvoid single_iteration(FLOAT *result, FLOAT *temp, FLOAT *power, int row, int col,\n\t\t\t\t\t  FLOAT Cap_1, FLOAT Rx_1, FLOAT Ry_1, FLOAT Rz_1, \n\t\t\t\t\t  FLOAT step)\n{\n    FLOAT delta;\n    int r, c;\n    int chunk;\n    int num_chunk = row*col / (BLOCK_SIZE_R * BLOCK_SIZE_C);\n    int chunks_in_row = col/BLOCK_SIZE_C;\n    int chunks_in_col = row/BLOCK_SIZE_R;\n\n#ifdef OPEN\n    #ifndef __MIC__\n\tomp_set_num_threads(num_omp_threads);\n    #endif\n    #pragma omp parallel for shared(power, temp, result) private(chunk, r, c, delta) firstprivate(row, col, num_chunk, chunks_in_row) schedule(static)\n#endif\n    for ( chunk = 0; chunk < num_chunk; ++chunk )\n    {\n        int r_start = BLOCK_SIZE_R*(chunk/chunks_in_col);\n        int c_start = BLOCK_SIZE_C*(chunk%chunks_in_row); \n        int r_end = r_start + BLOCK_SIZE_R > row ? row : r_start + BLOCK_SIZE_R;\n        int c_end = c_start + BLOCK_SIZE_C > col ? col : c_start + BLOCK_SIZE_C;\n       \n        if ( r_start == 0 || c_start == 0 || r_end == row || c_end == col )\n        {\n            for ( r = r_start; r < r_start + BLOCK_SIZE_R; ++r ) {\n                for ( c = c_start; c < c_start + BLOCK_SIZE_C; ++c ) {\n                    /* Corner 1 */\n                    if ( (r == 0) && (c == 0) ) {\n                        delta = (Cap_1) * (power[0] +\n                            (temp[1] - temp[0]) * Rx_1 +\n                            (temp[col] - temp[0]) * Ry_1 +\n                            (amb_temp - temp[0]) * Rz_1);\n                    }\t/* Corner 2 */\n                    else if ((r == 0) && (c == col-1)) {\n                        delta = (Cap_1) * (power[c] +\n                            (temp[c-1] - temp[c]) * Rx_1 +\n                            (temp[c+col] - temp[c]) * Ry_1 +\n                        (   amb_temp - temp[c]) * Rz_1);\n                    }\t/* Corner 3 */\n                    else if ((r == row-1) && (c == col-1)) {\n                        delta = (Cap_1) * (power[r*col+c] + \n                            (temp[r*col+c-1] - temp[r*col+c]) * Rx_1 + \n                            (temp[(r-1)*col+c] - temp[r*col+c]) * Ry_1 + \n                        (   amb_temp - temp[r*col+c]) * Rz_1);\t\t\t\t\t\n                    }\t/* Corner 4\t*/\n                    else if ((r == row-1) && (c == 0)) {\n                        delta = (Cap_1) * (power[r*col] + \n                            (temp[r*col+1] - temp[r*col]) * Rx_1 + \n                            (temp[(r-1)*col] - temp[r*col]) * Ry_1 + \n                            (amb_temp - temp[r*col]) * Rz_1);\n                    }\t/* Edge 1 */\n                    else if (r == 0) {\n                        delta = (Cap_1) * (power[c] + \n                            (temp[c+1] + temp[c-1] - 2.0*temp[c]) * Rx_1 + \n                            (temp[col+c] - temp[c]) * Ry_1 + \n                            (amb_temp - temp[c]) * Rz_1);\n                    }\t/* Edge 2 */\n                    else if (c == col-1) {\n                        delta = (Cap_1) * (power[r*col+c] + \n                            (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.0*temp[r*col+c]) * Ry_1 + \n                            (temp[r*col+c-1] - temp[r*col+c]) * Rx_1 + \n                            (amb_temp - temp[r*col+c]) * Rz_1);\n                    }\t/* Edge 3 */\n                    else if (r == row-1) {\n                        delta = (Cap_1) * (power[r*col+c] + \n                            (temp[r*col+c+1] + temp[r*col+c-1] - 2.0*temp[r*col+c]) * Rx_1 + \n                            (temp[(r-1)*col+c] - temp[r*col+c]) * Ry_1 + \n                            (amb_temp - temp[r*col+c]) * Rz_1);\n                    }\t/* Edge 4 */\n                    else if (c == 0) {\n                        delta = (Cap_1) * (power[r*col] + \n                            (temp[(r+1)*col] + temp[(r-1)*col] - 2.0*temp[r*col]) * Ry_1 + \n                            (temp[r*col+1] - temp[r*col]) * Rx_1 + \n                            (amb_temp - temp[r*col]) * Rz_1);\n                    }\n                    result[r*col+c] =temp[r*col+c]+ delta;\n                }\n            }\n            continue;\n        }\n\n        for ( r = r_start; r < r_start + BLOCK_SIZE_R; ++r ) {\n#pragma omp simd        \n            for ( c = c_start; c < c_start + BLOCK_SIZE_C; ++c ) {\n            /* Update Temperatures */\n                result[r*col+c] =temp[r*col+c]+ \n                     ( Cap_1 * (power[r*col+c] + \n                    (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.f*temp[r*col+c]) * Ry_1 + \n                    (temp[r*col+c+1] + temp[r*col+c-1] - 2.f*temp[r*col+c]) * Rx_1 + \n                    (amb_temp - temp[r*col+c]) * Rz_1));\n            }\n        }\n    }\n}\n\n#ifdef OMP_OFFLOAD\n#pragma offload_attribute(pop)\n#endif\n\n/* Transient solver driver routine: simply converts the heat \n * transfer differential equations to difference equations \n * and solves the difference equations by iterating\n */\nvoid compute_tran_temp(FLOAT *result, int num_iterations, FLOAT *temp, FLOAT *power, int row, int col) \n{\n\t#ifdef VERBOSE\n\tint i = 0;\n\t#endif\n\n\tFLOAT grid_height = chip_height / row;\n\tFLOAT grid_width = chip_width / col;\n\n\tFLOAT Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;\n\tFLOAT Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);\n\tFLOAT Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);\n\tFLOAT Rz = t_chip / (K_SI * grid_height * grid_width);\n\n\tFLOAT max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);\n    FLOAT step = PRECISION / max_slope / 1000.0;\n\n    FLOAT Rx_1=1.f/Rx;\n    FLOAT Ry_1=1.f/Ry;\n    FLOAT Rz_1=1.f/Rz;\n    FLOAT Cap_1 = step/Cap;\n\t#ifdef VERBOSE\n\tfprintf(stdout, \"total iterations: %d s\\tstep size: %g s\\n\", num_iterations, step);\n\tfprintf(stdout, \"Rx: %g\\tRy: %g\\tRz: %g\\tCap: %g\\n\", Rx, Ry, Rz, Cap);\n\t#endif\n\n#ifdef OMP_OFFLOAD\n        int array_size = row*col;\n#pragma omp target \\\n        map(temp[0:array_size]) \\\n        map(to: power[0:array_size], row, col, Cap_1, Rx_1, Ry_1, Rz_1, step, num_iterations) \\\n        map( result[0:array_size])\n#endif\n        {\n            FLOAT* r = result;\n            FLOAT* t = temp;\n            for (int i = 0; i < num_iterations ; i++)\n            {\n                #ifdef VERBOSE\n                fprintf(stdout, \"iteration %d\\n\", i++);\n                #endif\n                single_iteration(r, t, power, row, col, Cap_1, Rx_1, Ry_1, Rz_1, step);\n                FLOAT* tmp = t;\n                t = r;\n                r = tmp;\n            }\t\n        }\n\t#ifdef VERBOSE\n\tfprintf(stdout, \"iteration %d\\n\", i++);\n\t#endif\n}\n\nvoid fatal(char *s)\n{\n\tfprintf(stderr, \"error: %s\\n\", s);\n\texit(1);\n}\n\nvoid writeoutput(FLOAT *vect, int grid_rows, int grid_cols, char *file) {\n\n    int i,j, index=0;\n    FILE *fp;\n    char str[STR_SIZE];\n\n    if( (fp = fopen(file, \"w\" )) == 0 )\n        printf( \"The file was not opened\\n\" );\n\n\n    for (i=0; i < grid_rows; i++) \n        for (j=0; j < grid_cols; j++)\n        {\n\n            sprintf(str, \"%d\\t%g\\n\", index, vect[i*grid_cols+j]);\n            fputs(str,fp);\n            index++;\n        }\n\n    fclose(fp);\t\n}\n\nvoid read_input(FLOAT *vect, int grid_rows, int grid_cols, char *file)\n{\n  \tint i, index;\n\tFILE *fp;\n\tchar str[STR_SIZE];\n\tFLOAT val;\n\n\tfp = fopen (file, \"r\");\n\tif (!fp)\n\t\tfatal (\"file could not be opened for reading\");\n\n\tfor (i=0; i < grid_rows * grid_cols; i++) {\n\t\tfgets(str, STR_SIZE, fp);\n\t\tif (feof(fp))\n\t\t\tfatal(\"not enough lines in file\");\n\t\tif ((sscanf(str, \"%f\", &val) != 1) )\n\t\t\tfatal(\"invalid file format\");\n\t\tvect[i] = val;\n\t}\n\n\tfclose(fp);\t\n}\n\nvoid usage(int argc, char **argv)\n{\n\tfprintf(stderr, \"Usage: %s <grid_rows> <grid_cols> <sim_time> <no. of threads><temp_file> <power_file>\\n\", argv[0]);\n\tfprintf(stderr, \"\\t<grid_rows>  - number of rows in the grid (positive integer)\\n\");\n\tfprintf(stderr, \"\\t<grid_cols>  - number of columns in the grid (positive integer)\\n\");\n\tfprintf(stderr, \"\\t<sim_time>   - number of iterations\\n\");\n\tfprintf(stderr, \"\\t<no. of threads>   - number of threads\\n\");\n\tfprintf(stderr, \"\\t<temp_file>  - name of the file containing the initial temperature values of each cell\\n\");\n\tfprintf(stderr, \"\\t<power_file> - name of the file containing the dissipated power values of each cell\\n\");\n        fprintf(stderr, \"\\t<output_file> - name of the output file\\n\");\n\texit(1);\n}\n\nint main(int argc, char **argv)\n{\n\tint grid_rows, grid_cols, sim_time, i;\n\tFLOAT *temp, *power, *result;\n\tchar *tfile, *pfile, *ofile;\n\t\n\t/* check validity of inputs\t*/\n\tif (argc != 8)\n\t\tusage(argc, argv);\n\tif ((grid_rows = atoi(argv[1])) <= 0 ||\n\t\t(grid_cols = atoi(argv[2])) <= 0 ||\n\t\t(sim_time = atoi(argv[3])) <= 0 || \n\t\t(num_omp_threads = atoi(argv[4])) <= 0\n\t\t)\n\t\tusage(argc, argv);\n\n\t/* allocate memory for the temperature and power arrays\t*/\n\ttemp = (FLOAT *) calloc (grid_rows * grid_cols, sizeof(FLOAT));\n\tpower = (FLOAT *) calloc (grid_rows * grid_cols, sizeof(FLOAT));\n\tresult = (FLOAT *) calloc (grid_rows * grid_cols, sizeof(FLOAT));\n\tif(!temp || !power)\n\t\tfatal(\"unable to allocate memory\");\n\n\t/* read initial temperatures and input power\t*/\n\ttfile = argv[5];\n\tpfile = argv[6];\n    ofile = argv[7];\n\n\tread_input(temp, grid_rows, grid_cols, tfile);\n\tread_input(power, grid_rows, grid_cols, pfile);\n\n\tprintf(\"Start computing the transient temperature\\n\");\n\t\n    long long start_time = get_time();\n\n    compute_tran_temp(result,sim_time, temp, power, grid_rows, grid_cols);\n\n    long long end_time = get_time();\n\n    printf(\"Ending simulation\\n\");\n    printf(\"Total time: %.3f seconds\\n\", ((float) (end_time - start_time)) / (1000*1000));\n\n    writeoutput((1&sim_time) ? result : temp, grid_rows, grid_cols, ofile);\n\n\t/* output results\t*/\n#ifdef VERBOSE\n\tfprintf(stdout, \"Final Temperatures:\\n\");\n#endif\n\n#ifdef OUTPUT\n\tfor(i=0; i < grid_rows * grid_cols; i++)\n\tfprintf(stdout, \"%d\\t%g\\n\", i, temp[i]);\n#endif\n\t/* cleanup\t*/\n\tfree(temp);\n\tfree(power);\n\n\treturn 0;\n}\n/* vim: set ts=4 sw=4  sts=4 et si ai: */", "label": 2}
{"code": "#include <stdio.h>\n#include <string.h>\n#include <math.h>\n#include <stdlib.h>\n#include <omp.h>\n//#define NUM_THREAD 4\n#define OPEN\n\n\nFILE *fp;\n\n//Structure to hold a node information\nstruct Node\n{\n\tint starting;\n\tint no_of_edges;\n};\n\nvoid BFSGraph(int argc, char** argv);\n\nvoid Usage(int argc, char**argv){\n\nfprintf(stderr,\"Usage: %s <num_threads> <input_file>\\n\", argv[0]);\n\n}\n////////////////////////////////////////////////////////////////////////////////\n// Main Program\n////////////////////////////////////////////////////////////////////////////////\nint main( int argc, char** argv) \n{\n\tBFSGraph( argc, argv);\n}\n\n\n\n////////////////////////////////////////////////////////////////////////////////\n//Apply BFS on a Graph using CUDA\n////////////////////////////////////////////////////////////////////////////////\nvoid BFSGraph( int argc, char** argv) \n{\n        int no_of_nodes = 0;\n        int edge_list_size = 0;\n        char *input_f;\n\tint\t num_omp_threads;\n\t\n\tif(argc!=3){\n\tUsage(argc, argv);\n\texit(0);\n\t}\n    \n\tnum_omp_threads = atoi(argv[1]);\n\tinput_f = argv[2];\n\t\n\tprintf(\"Reading File\\n\");\n\t//Read in Graph from a file\n\tfp = fopen(input_f,\"r\");\n\tif(!fp)\n\t{\n\t\tprintf(\"Error Reading graph file\\n\");\n\t\treturn;\n\t}\n\n\tint source = 0;\n\n\tfscanf(fp,\"%d\",&no_of_nodes);\n   \n\t// allocate host memory\n\tNode* h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);\n\tbool *h_graph_mask = (bool*) malloc(sizeof(bool)*no_of_nodes);\n\tbool *h_updating_graph_mask = (bool*) malloc(sizeof(bool)*no_of_nodes);\n\tbool *h_graph_visited = (bool*) malloc(sizeof(bool)*no_of_nodes);\n\n\tint start, edgeno;   \n\t// initalize the memory\n\tfor( unsigned int i = 0; i < no_of_nodes; i++) \n\t{\n\t\tfscanf(fp,\"%d %d\",&start,&edgeno);\n\t\th_graph_nodes[i].starting = start;\n\t\th_graph_nodes[i].no_of_edges = edgeno;\n\t\th_graph_mask[i]=false;\n\t\th_updating_graph_mask[i]=false;\n\t\th_graph_visited[i]=false;\n\t}\n\n\t//read the source node from the file\n\tfscanf(fp,\"%d\",&source);\n\t// source=0; //tesing code line\n\n\t//set the source node as true in the mask\n\th_graph_mask[source]=true;\n\th_graph_visited[source]=true;\n\n\tfscanf(fp,\"%d\",&edge_list_size);\n\n\tint id,cost;\n\tint* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);\n\tfor(int i=0; i < edge_list_size ; i++)\n\t{\n\t\tfscanf(fp,\"%d\",&id);\n\t\tfscanf(fp,\"%d\",&cost);\n\t\th_graph_edges[i] = id;\n\t}\n\n\tif(fp)\n\t\tfclose(fp);    \n\n\n\t// allocate mem for the result on host side\n\tint* h_cost = (int*) malloc( sizeof(int)*no_of_nodes);\n\tfor(int i=0;i<no_of_nodes;i++)\n\t\th_cost[i]=-1;\n\th_cost[source]=0;\n\t\n\tprintf(\"Start traversing the tree\\n\");\n\t\n\tint k=0;\n#ifdef OPEN\n        double start_time = omp_get_wtime();\n#ifdef OMP_OFFLOAD\n#pragma omp target data map(to: no_of_nodes, h_graph_mask[0:no_of_nodes], h_graph_nodes[0:no_of_nodes], h_graph_edges[0:edge_list_size], h_graph_visited[0:no_of_nodes], h_updating_graph_mask[0:no_of_nodes]) map(h_cost[0:no_of_nodes])\n        {\n#endif \n#endif\n\tbool stop;\n\tdo\n        {\n            //if no thread changes this value then the loop stops\n            stop=false;\n\n#ifdef OPEN\n            //omp_set_num_threads(num_omp_threads);\n    #ifdef OMP_OFFLOAD\n    #pragma omp target\n    #endif\n    #pragma omp parallel for \n#endif \n            for(int tid = 0; tid < no_of_nodes; tid++ )\n            {\n                if (h_graph_mask[tid] == true){ \n                    h_graph_mask[tid]=false;\n                    for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++)\n                    {\n                        int id = h_graph_edges[i];\n                        if(!h_graph_visited[id])\n                        {\n                            h_cost[id]=h_cost[tid]+1;\n                            h_updating_graph_mask[id]=true;\n                        }\n                    }\n                }\n            }\n\n#ifdef OPEN\n    #ifdef OMP_OFFLOAD\n    #pragma omp target map(stop)\n    #endif\n    #pragma omp parallel for\n#endif\n            for(int tid=0; tid< no_of_nodes ; tid++ )\n            {\n                if (h_updating_graph_mask[tid] == true){\n                    h_graph_mask[tid]=true;\n                    h_graph_visited[tid]=true;\n                    stop=true;\n                    h_updating_graph_mask[tid]=false;\n                }\n            }\n            k++;\n        }\n\twhile(stop);\n#ifdef OPEN\n        double end_time = omp_get_wtime();\n        printf(\"Compute time: %lf\\n\", (end_time - start_time));\n#ifdef OMP_OFFLOAD\n        }\n#endif\n#endif\n\t//Store the result into a file\n\tFILE *fpo = fopen(\"result.txt\",\"w\");\n\tfor(int i=0;i<no_of_nodes;i++)\n\t\tfprintf(fpo,\"%d) cost:%d\\n\",i,h_cost[i]);\n\tfclose(fpo);\n\tprintf(\"Result stored in result.txt\\n\");\n\n\n\t// cleanup memory\n\tfree( h_graph_nodes);\n\tfree( h_graph_edges);\n\tfree( h_graph_mask);\n\tfree( h_updating_graph_mask);\n\tfree( h_graph_visited);\n\tfree( h_cost);\n\n}\n", "label": 2}
{"code": "#include <stdio.h>\n#include <stdlib.h>\n#include <time.h>\n#include <assert.h>\n\n#include \"timer.h\"\n\nvoid run(int argc, char** argv);\n\n/* define timer macros */\n#define pin_stats_reset()   startCycle()\n#define pin_stats_pause(cycles)   stopCycle(cycles)\n#define pin_stats_dump(cycles)    printf(\"timer: %Lu\\n\", cycles)\n\n#define BENCH_PRINT\n\nint rows, cols;\nint* data;\nint** wall;\nint* result;\n#define M_SEED 9\n\nvoid\ninit(int argc, char** argv)\n{\n\tif(argc==3){\n\t\tcols = atoi(argv[1]);\n\t\trows = atoi(argv[2]);\n\t}else{\n                printf(\"Usage: pathfiner width num_of_steps\\n\");\n                exit(0);\n        }\n\tdata = new int[rows*cols];\n\twall = new int*[rows];\n\tfor(int n=0; n<rows; n++)\n\t\twall[n]=data+cols*n;\n\tresult = new int[cols];\n\t\n\tint seed = M_SEED;\n\tsrand(seed);\n\n\tfor (int i = 0; i < rows; i++)\n    {\n        for (int j = 0; j < cols; j++)\n        {\n            wall[i][j] = rand() % 10;\n        }\n    }\n    for (int j = 0; j < cols; j++)\n        result[j] = wall[0][j];\n#ifdef BENCH_PRINT\n    for (int i = 0; i < rows; i++)\n    {\n        for (int j = 0; j < cols; j++)\n        {\n            printf(\"%d \",wall[i][j]) ;\n        }\n        printf(\"\\n\") ;\n    }\n#endif\n}\n\nvoid \nfatal(char *s)\n{\n\tfprintf(stderr, \"error: %s\\n\", s);\n\n}\n\n#define IN_RANGE(x, min, max)   ((x)>=(min) && (x)<=(max))\n#define CLAMP_RANGE(x, min, max) x = (x<(min)) ? min : ((x>(max)) ? max : x )\n#define MIN(a, b) ((a)<=(b) ? (a) : (b))\n\nint main(int argc, char** argv)\n{\n    run(argc,argv);\n\n    return EXIT_SUCCESS;\n}\n\nvoid run(int argc, char** argv)\n{\n    init(argc, argv);\n\n    unsigned long long cycles;\n\n    int *src, *dst, *temp;\n    int min;\n\n    dst = result;\n    src = new int[cols];\n\n    pin_stats_reset();\n    for (int t = 0; t < rows-1; t++) {\n        temp = src;\n        src = dst;\n        dst = temp;\n        #pragma omp parallel for private(min)\n        for(int n = 0; n < cols; n++){\n          min = src[n];\n          if (n > 0)\n            min = MIN(min, src[n-1]);\n          if (n < cols-1)\n            min = MIN(min, src[n+1]);\n          dst[n] = wall[t+1][n]+min;\n        }\n    }\n\n    pin_stats_pause(cycles);\n    pin_stats_dump(cycles);\n\n#ifdef BENCH_PRINT\n\n    for (int i = 0; i < cols; i++)\n\n            printf(\"%d \",data[i]) ;\n\n    printf(\"\\n\") ;\n\n    for (int i = 0; i < cols; i++)\n\n            printf(\"%d \",dst[i]) ;\n\n    printf(\"\\n\") ;\n\n#endif\n\n    delete [] data;\n    delete [] wall;\n    delete [] dst;\n    delete [] src;\n}\n", "label": 2}
{"code": "#include <stdio.h>\n#include <stdlib.h>\n#include <time.h>\n#include <assert.h>\n\n#include \"timer.h\"\n\nvoid run(int argc, char** argv);\n\n/* define timer macros */\n#define pin_stats_reset()   startCycle()\n#define pin_stats_pause(cycles)   stopCycle(cycles)\n#define pin_stats_dump(cycles)    printf(\"timer: %Lu\\n\", cycles)\n\n#define BENCH_PRINT\n\nint rows, cols;\nint* data;\nint** wall;\nint* result;\n#define M_SEED 9\n\nvoid\ninit(int argc, char** argv)\n{\n\tif(argc==3){\n\t\tcols = atoi(argv[1]);\n\t\trows = atoi(argv[2]);\n\t}else{\n                printf(\"Usage: pathfiner width num_of_steps\\n\");\n                exit(0);\n        }\n\tdata = new int[rows*cols];\n\twall = new int*[rows];\n\tfor(int n=0; n<rows; n++)\n\t\twall[n]=data+cols*n;\n\tresult = new int[cols];\n\t\n\tint seed = M_SEED;\n\tsrand(seed);\n\n\tfor (int i = 0; i < rows; i++)\n    {\n        for (int j = 0; j < cols; j++)\n        {\n            wall[i][j] = rand() % 10;\n        }\n    }\n    for (int j = 0; j < cols; j++)\n        result[j] = wall[0][j];\n#ifdef BENCH_PRINT\n    for (int i = 0; i < rows; i++)\n    {\n        for (int j = 0; j < cols; j++)\n        {\n            printf(\"%d \",wall[i][j]) ;\n        }\n        printf(\"\\n\") ;\n    }\n#endif\n}\n\nvoid \nfatal(char *s)\n{\n\tfprintf(stderr, \"error: %s\\n\", s);\n\n}\n\n#define IN_RANGE(x, min, max)   ((x)>=(min) && (x)<=(max))\n#define CLAMP_RANGE(x, min, max) x = (x<(min)) ? min : ((x>(max)) ? max : x )\n#define MIN(a, b) ((a)<=(b) ? (a) : (b))\n\nint main(int argc, char** argv)\n{\n    run(argc,argv);\n\n    return EXIT_SUCCESS;\n}\n\nvoid run(int argc, char** argv)\n{\n    init(argc, argv);\n\n    unsigned long long cycles;\n\n    int *src, *dst, *temp;\n    int min;\n\n    dst = result;\n    src = new int[cols];\n\n    pin_stats_reset();\n    for (int t = 0; t < rows-1; t++) {\n        temp = src;\n        src = dst;\n        dst = temp;\n        #pragma omp parallel for private(min)\n        for(int n = 0; n < cols; n++){\n          min = src[n];\n          if (n > 0)\n            min = MIN(min, src[n-1]);\n          if (n < cols-1)\n            min = MIN(min, src[n+1]);\n          dst[n] = wall[t+1][n]+min;\n        }\n    }\n\n    pin_stats_pause(cycles);\n    pin_stats_dump(cycles);\n\n#ifdef BENCH_PRINT\n\n    for (int i = 0; i < cols; i++)\n\n            printf(\"%d \",data[i]) ;\n\n    printf(\"\\n\") ;\n\n    for (int i = 0; i < cols; i++)\n\n            printf(\"%d \",dst[i]) ;\n\n    printf(\"\\n\") ;\n\n#endif\n\n    delete [] data;\n    delete [] wall;\n    delete [] dst;\n    delete [] src;\n}\n", "label": 2}