@@ -83,6 +83,7 @@ MAIN_ENV
8383#include "topology.h"
8484
// Per-ProcessId record of cache-line-aligned addresses. loadtree<true>() inserts
// into threadid_addresses_map[ProcessId] around each cell-lock acquire/release;
// a multiset is used, so repeated insertions of the same address are all kept.
8585std ::map < long , std ::multiset < double * >> threadid_addresses_map ;
// Spinlock held around every insertion into threadid_addresses_map in loadtree();
// initialised in main() before the worker threads are created and destroyed
// at the end of main().
86+ pthread_spinlock_t map_spinlock ;
8687
8788string defv [] = { /* DEFAULT PARAMETER VALUES */
8889 /* file names for input/output */
@@ -336,6 +337,8 @@ int main (int argc, string argv[])
336337 }
337338
338339 Global = NULL ;
340+
341+ pthread_spin_init (& map_spinlock , PTHREAD_PROCESS_SHARED );
339342 initparam (defv ); // modify initparam to read input from stdin only once
340343 startrun (); // create another version of this function that reuses loaded data
341344 initoutput (); // no need for modification, can be repeated
@@ -372,7 +375,7 @@ int main (int argc, string argv[])
372375 std ::cout << std ::endl ;
373376 assert (base_assigned_cores .size () == 28 );
374377
375- CREATE (SlaveStart , static_cast < void * > (base_assigned_cores .data ()), NPROC );
378+ CREATE (SlaveStart < true > , static_cast < void * > (base_assigned_cores .data ()), NPROC );
376379
377380 WAIT_FOR_END (NPROC );
378381
@@ -562,7 +565,7 @@ int main (int argc, string argv[])
562565 //assert(__threads__<__MAX_THREADS__);
563566 const auto cha_aware_start = high_resolution_clock ::now ();
564567
565- CREATE (SlaveStart , static_cast < void * > (/* thread_to_core.data()*/ base_assigned_cores .data ()), NPROC );
568+ CREATE (SlaveStart < false > , static_cast < void * > (thread_to_core .data () /* base_assigned_cores.data()*/ ), NPROC );
566569
567570 WAIT_FOR_END (NPROC );
568571 // std::cout << "AFTER JOIN. ended cha aware bm" << std::endl;
@@ -588,7 +591,7 @@ int main (int argc, string argv[])
588591 std ::cout << "Now running base BM" << std ::endl ;
589592 const auto base_start = high_resolution_clock ::now ();
590593
591- CREATE (SlaveStart , static_cast < void * > (base_assigned_cores .data ()), NPROC );
594+ CREATE (SlaveStart < false > , static_cast < void * > (base_assigned_cores .data ()), NPROC );
592595
593596 WAIT_FOR_END (NPROC );
594597
@@ -600,7 +603,8 @@ int main (int argc, string argv[])
600603
601604 //std::cout << "latency improv percentage: " << ((elapsed_base - elapsed_cha_aware) / static_cast<double>(elapsed_base)) * 100 << std::endl;
602605//#endif
603-
606+ pthread_spin_destroy (& map_spinlock );
607+ //std::cerr << "after pthread_spin_destroy\n";
604608 MAIN_END ;
605609}
606610
@@ -767,6 +771,7 @@ void stick_this_thread_to_core(int core_id) {
767771/*
768772 * SLAVESTART: main task for each processor
769773 */
774+ template < bool is_preprocessing >
770775void SlaveStart (void * data )
771776{
772777 //printf("SlaveStart begins\n");
@@ -832,7 +837,7 @@ void SlaveStart(void* data)
832837
833838 /* main loop */
834839 while (Local [ProcessId ].tnow < tstop + 0.1 * dtime ) {
835- stepsystem (ProcessId );
840+ stepsystem < is_preprocessing > (ProcessId );
836841// printtree(Global->G_root);
837842 //printf("Going to next step!!!\n");
838843 }
@@ -1099,11 +1104,280 @@ long intpow(long i, long j)
10991104 return temp ;
11001105}
11011106
1107+ /*
1108+ * MAKETREE: initialize tree structure for hack force calculation.
1109+ */
1110+ template < bool is_preprocessing >
1111+ void maketree (long ProcessId )
1112+ {
1113+ bodyptr p , * pp ;
1114+
1115+ Local [ProcessId ].myncell = 0 ;
1116+ Local [ProcessId ].mynleaf = 0 ;
1117+ if (ProcessId == 0 ) {
1118+ Local [ProcessId ].mycelltab [Local [ProcessId ].myncell ++ ] = Global -> G_root ;
1119+ }
1120+ Local [ProcessId ].Current_Root = (nodeptr ) Global -> G_root ;
1121+ for (pp = Local [ProcessId ].mybodytab ;
1122+ pp < Local [ProcessId ].mybodytab + Local [ProcessId ].mynbody ; pp ++ ) {
1123+ p = * pp ;
1124+ if (Mass (p ) != 0.0 ) {
1125+ Local [ProcessId ].Current_Root
1126+ = (nodeptr ) loadtree < is_preprocessing > (p , (cellptr ) Local [ProcessId ].Current_Root ,
1127+ ProcessId );
1128+ }
1129+ else {
1130+ LOCK (Global -> io_lock );
1131+ fprintf (stderr , "Process %ld found body %ld to have zero mass\n" ,
1132+ ProcessId , (long ) p );
1133+ UNLOCK (Global -> io_lock );
1134+ }
1135+ }
1136+
1137+ {
1138+ unsigned long Error , Cycle ;
1139+ long Cancel , Temp ;
1140+
1141+ Error = pthread_mutex_lock (& (Global -> Barrier ).mutex );
1142+ if (Error != 0 ) {
1143+ printf ("Error while trying to get lock in barrier.\n" );
1144+ exit (-1 );
1145+ }
1146+
1147+ Cycle = (Global -> Barrier ).cycle ;
1148+ if (++ (Global -> Barrier ).counter != (NPROC )) {
1149+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE , (int * ) & Cancel );
1150+ while (Cycle == (Global -> Barrier ).cycle ) {
1151+ Error = pthread_cond_wait (& (Global -> Barrier ).cv , & (Global -> Barrier ).mutex );
1152+ if (Error != 0 ) {
1153+ break ;
1154+ }
1155+ }
1156+ pthread_setcancelstate (Cancel , (int * ) & Temp );
1157+ } else {
1158+ (Global -> Barrier ).cycle = !(Global -> Barrier ).cycle ;
1159+ (Global -> Barrier ).counter = 0 ;
1160+ Error = pthread_cond_broadcast (& (Global -> Barrier ).cv );
1161+ }
1162+ pthread_mutex_unlock (& (Global -> Barrier ).mutex );
1163+ }
1164+
1165+ hackcofm (ProcessId );
1166+ {
1167+ unsigned long Error , Cycle ;
1168+ long Cancel , Temp ;
1169+
1170+ Error = pthread_mutex_lock (& (Global -> Barrier ).mutex );
1171+ if (Error != 0 ) {
1172+ printf ("Error while trying to get lock in barrier.\n" );
1173+ exit (-1 );
1174+ }
1175+
1176+ Cycle = (Global -> Barrier ).cycle ;
1177+ if (++ (Global -> Barrier ).counter != (NPROC )) {
1178+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE , (int * ) & Cancel );
1179+ while (Cycle == (Global -> Barrier ).cycle ) {
1180+ Error = pthread_cond_wait (& (Global -> Barrier ).cv , & (Global -> Barrier ).mutex );
1181+ if (Error != 0 ) {
1182+ break ;
1183+ }
1184+ }
1185+ pthread_setcancelstate (Cancel , (int * ) & Temp );
1186+ } else {
1187+ (Global -> Barrier ).cycle = !(Global -> Barrier ).cycle ;
1188+ (Global -> Barrier ).counter = 0 ;
1189+ Error = pthread_cond_broadcast (& (Global -> Barrier ).cv );
1190+ }
1191+ pthread_mutex_unlock (& (Global -> Barrier ).mutex );
1192+ }
1193+ }
1194+
1195+ /*
1196+ * * LOADTREE: descend tree and insert particle.
1197+ * */
1198+ template < bool is_preprocessing >
1199+ nodeptr loadtree (bodyptr p , cellptr root , long ProcessId )
1200+ {
1201+ long l , xp [NDIM ], xor_arr [NDIM ], flag ;
1202+ long i , j , root_level ;
1203+ bool valid_root ;
1204+ long kidIndex ;
1205+ volatile nodeptr * volatile qptr , mynode ;
1206+ leafptr le ;
1207+
1208+ intcoord (xp , Pos (p ));
1209+ valid_root = TRUE;
1210+ for (i = 0 ; i < NDIM ; i ++ ) {
1211+ xor_arr [i ] = xp [i ] ^ Local [ProcessId ].Root_Coords [i ];
1212+ }
1213+ for (i = IMAX >> 1 ; i > Level (root ); i >>= 1 ) {
1214+ for (j = 0 ; j < NDIM ; j ++ ) {
1215+ if (xor_arr [j ] & i ) {
1216+ valid_root = FALSE;
1217+ break ;
1218+ }
1219+ }
1220+ if (!valid_root ) {
1221+ break ;
1222+ }
1223+ }
1224+ if (!valid_root ) {
1225+ if (root != Global -> G_root ) {
1226+ root_level = Level (root );
1227+ for (j = i ; j > root_level ; j >>= 1 ) {
1228+ root = (cellptr ) Parent (root );
1229+ }
1230+ valid_root = TRUE;
1231+ for (i = IMAX >> 1 ; i > Level (root ); i >>= 1 ) {
1232+ for (j = 0 ; j < NDIM ; j ++ ) {
1233+ if (xor_arr [j ] & i ) {
1234+ valid_root = FALSE;
1235+ break ;
1236+ }
1237+ }
1238+ if (!valid_root ) {
1239+ printf ("P%ld body %ld\n" , ProcessId , p - bodytab );
1240+ root = Global -> G_root ;
1241+ }
1242+ }
1243+ }
1244+ }
1245+ root = Global -> G_root ;
1246+ mynode = (nodeptr ) root ;
1247+ kidIndex = subindex (xp , Level (mynode ));
1248+ qptr = & Subp (mynode )[kidIndex ];
1249+
1250+ l = Level (mynode ) >> 1 ;
1251+ flag = TRUE;
1252+ while (flag ) { /* loop descending tree */
1253+ if (l == 0 ) {
1254+ error ("not enough levels in tree\n" );
1255+ }
1256+ if (* qptr == NULL ) {
1257+ /* lock the parent cell */
1258+ ALOCK (CellLock -> CL , ((cellptr ) mynode )-> seqnum % MAXLOCK );
1259+ if constexpr (is_preprocessing )
1260+ {
1261+ pthread_spin_lock (& map_spinlock );
1262+
1263+ threadid_addresses_map [ProcessId ].insert (
1264+ reinterpret_cast < double * > (
1265+ reinterpret_cast < uintptr_t > (& (CellLock -> CL )) &
1266+ ~(CACHELINE_SIZE - 1 )
1267+ )
1268+ );
1269+
1270+ threadid_addresses_map [ProcessId ].insert (
1271+ reinterpret_cast < double * > (
1272+ reinterpret_cast < uintptr_t > (& (((cellptr ) mynode )-> seqnum )) &
1273+ ~(CACHELINE_SIZE - 1 )
1274+ )
1275+ );
1276+ pthread_spin_unlock (& map_spinlock );
1277+ }
1278+ if (* qptr == NULL ) {
1279+ le = InitLeaf ((cellptr ) mynode , ProcessId );
1280+ Parent (p ) = (nodeptr ) le ;
1281+ Level (p ) = l ;
1282+ ChildNum (p ) = le -> num_bodies ;
1283+ ChildNum (le ) = kidIndex ;
1284+ Bodyp (le )[le -> num_bodies ++ ] = p ;
1285+ * qptr = (nodeptr ) le ;
1286+ flag = FALSE;
1287+ }
1288+ AULOCK (CellLock -> CL , ((cellptr ) mynode )-> seqnum % MAXLOCK );
1289+ if constexpr (is_preprocessing )
1290+ {
1291+ pthread_spin_lock (& map_spinlock );
1292+
1293+ threadid_addresses_map [ProcessId ].insert (
1294+ reinterpret_cast < double * > (
1295+ reinterpret_cast < uintptr_t > (& (CellLock -> CL )) &
1296+ ~(CACHELINE_SIZE - 1 )
1297+ )
1298+ );
1299+ threadid_addresses_map [ProcessId ].insert (
1300+ reinterpret_cast < double * > (
1301+ reinterpret_cast < uintptr_t > (& (((cellptr ) mynode )-> seqnum )) &
1302+ ~(CACHELINE_SIZE - 1 )
1303+ )
1304+ );
1305+ pthread_spin_unlock (& map_spinlock );
1306+ }
1307+ /* unlock the parent cell */
1308+ }
1309+ if (flag && * qptr && (Type (* qptr ) == LEAF )) {
1310+ /* reached a "leaf"? */
1311+ ALOCK (CellLock -> CL , ((cellptr ) mynode )-> seqnum % MAXLOCK );
1312+ if constexpr (is_preprocessing )
1313+ {
1314+ pthread_spin_lock (& map_spinlock );
1315+
1316+ threadid_addresses_map [ProcessId ].insert (
1317+ reinterpret_cast < double * > (
1318+ reinterpret_cast < uintptr_t > (& (CellLock -> CL )) &
1319+ ~(CACHELINE_SIZE - 1 )
1320+ )
1321+ );
1322+ threadid_addresses_map [ProcessId ].insert (
1323+ reinterpret_cast < double * > (
1324+ reinterpret_cast < uintptr_t > (& (((cellptr ) mynode )-> seqnum )) &
1325+ ~(CACHELINE_SIZE - 1 )
1326+ )
1327+ );
1328+ pthread_spin_unlock (& map_spinlock );
1329+ }
1330+ /* lock the parent cell */
1331+ if (Type (* qptr ) == LEAF ) { /* still a "leaf"? */
1332+ le = (leafptr ) * qptr ;
1333+ if (le -> num_bodies == MAX_BODIES_PER_LEAF ) {
1334+ * qptr = (nodeptr ) SubdivideLeaf (le , (cellptr ) mynode , l ,
1335+ ProcessId );
1336+ }
1337+ else {
1338+ Parent (p ) = (nodeptr ) le ;
1339+ Level (p ) = l ;
1340+ ChildNum (p ) = le -> num_bodies ;
1341+ Bodyp (le )[le -> num_bodies ++ ] = p ;
1342+ flag = FALSE;
1343+ }
1344+ }
1345+ AULOCK (CellLock -> CL , ((cellptr ) mynode )-> seqnum % MAXLOCK );
1346+ if constexpr (is_preprocessing )
1347+ {
1348+ pthread_spin_lock (& map_spinlock );
1349+
1350+ threadid_addresses_map [ProcessId ].insert (
1351+ reinterpret_cast < double * > (
1352+ reinterpret_cast < uintptr_t > (& (CellLock -> CL )) &
1353+ ~(CACHELINE_SIZE - 1 )
1354+ )
1355+ );
1356+ threadid_addresses_map [ProcessId ].insert (
1357+ reinterpret_cast < double * > (
1358+ reinterpret_cast < uintptr_t > (& (((cellptr ) mynode )-> seqnum )) &
1359+ ~(CACHELINE_SIZE - 1 )
1360+ )
1361+ );
1362+ pthread_spin_unlock (& map_spinlock );
1363+ }
1364+ /* unlock the node */
1365+ }
1366+ if (flag ) {
1367+ mynode = * qptr ;
1368+ kidIndex = subindex (xp , l );
1369+ qptr = & Subp (* qptr )[kidIndex ]; /* move down one level */
1370+ l = l >> 1 ; /* and test next bit */
1371+ }
1372+ }
1373+ SETV (Local [ProcessId ].Root_Coords , xp );
1374+ return Parent ((leafptr ) * qptr );
1375+ }
11021376
11031377/*
11041378 * STEPSYSTEM: advance N-body system one time-step.
11051379 */
1106-
1380+ template < bool is_preprocessing >
11071381void stepsystem (long ProcessId )
11081382{
11091383 long i ;
@@ -1168,7 +1442,7 @@ void stepsystem(long ProcessId)
11681442 }
11691443
11701444 /* load bodies into tree */
1171- maketree (ProcessId );
1445+ maketree < is_preprocessing > (ProcessId );
11721446 if ((ProcessId == 0 ) && (Local [ProcessId ].nstep >= 2 )) {
11731447 CLOCK (treebuildend );
11741448 Global -> treebuildtime += treebuildend - treebuildstart ;
0 commit comments