Skip to content

Commit 67bd9ec

Browse files
committed
partial recording of communication for CHA-awareness mapping
1 parent e0b00a7 commit 67bd9ec

File tree

5 files changed

+298
-11
lines changed

5 files changed

+298
-11
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
TARGET = BARNES
22
OBJS = code.o code_io.o load.o grav.o getparam.o util.o cha.o topology.o
3+
#OBJS = code.o code_io.o grav.o getparam.o util.o cha.o topology.o
34

45
#CC := gcc
56
CC := g++

code.C

Lines changed: 281 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ MAIN_ENV
8383
#include "topology.h"
8484

8585
std::map<long, std::multiset<double *>> threadid_addresses_map;
86+
pthread_spinlock_t map_spinlock;
8687

8788
string defv[] = { /* DEFAULT PARAMETER VALUES */
8889
/* file names for input/output */
@@ -336,6 +337,8 @@ int main (int argc, string argv[])
336337
}
337338

338339
Global = NULL;
340+
341+
pthread_spin_init(&map_spinlock, PTHREAD_PROCESS_SHARED);
339342
initparam(defv); // modify initparam to read input from stdin only once
340343
startrun(); // create another version of this function that reuses loaded data
341344
initoutput(); // no need for modification, can be repeated
@@ -372,7 +375,7 @@ int main (int argc, string argv[])
372375
std::cout << std::endl;
373376
assert(base_assigned_cores.size() == 28);
374377

375-
CREATE(SlaveStart, static_cast<void*>(base_assigned_cores.data()), NPROC);
378+
CREATE(SlaveStart<true>, static_cast<void*>(base_assigned_cores.data()), NPROC);
376379

377380
WAIT_FOR_END(NPROC);
378381

@@ -562,7 +565,7 @@ int main (int argc, string argv[])
562565
//assert(__threads__<__MAX_THREADS__);
563566
const auto cha_aware_start = high_resolution_clock::now();
564567

565-
CREATE(SlaveStart, static_cast<void*>(/*thread_to_core.data()*/base_assigned_cores.data()), NPROC);
568+
CREATE(SlaveStart<false>, static_cast<void*>(thread_to_core.data() /*base_assigned_cores.data()*/), NPROC);
566569

567570
WAIT_FOR_END(NPROC);
568571
// std::cout << "AFTER JOIN. ended cha aware bm" << std::endl;
@@ -588,7 +591,7 @@ int main (int argc, string argv[])
588591
std::cout << "Now running base BM" << std::endl;
589592
const auto base_start = high_resolution_clock::now();
590593

591-
CREATE(SlaveStart, static_cast<void*>(base_assigned_cores.data()), NPROC);
594+
CREATE(SlaveStart<false>, static_cast<void*>(base_assigned_cores.data()), NPROC);
592595

593596
WAIT_FOR_END(NPROC);
594597

@@ -600,7 +603,8 @@ int main (int argc, string argv[])
600603

601604
//std::cout << "latency improv percentage: " << ((elapsed_base - elapsed_cha_aware) / static_cast<double>(elapsed_base)) * 100 << std::endl;
602605
//#endif
603-
606+
pthread_spin_destroy(&map_spinlock);
607+
//std::cerr << "after pthread_spin_destroy\n";
604608
MAIN_END;
605609
}
606610

@@ -767,6 +771,7 @@ void stick_this_thread_to_core(int core_id) {
767771
/*
768772
* SLAVESTART: main task for each processor
769773
*/
774+
template<bool is_preprocessing>
770775
void SlaveStart(void* data)
771776
{
772777
//printf("SlaveStart begins\n");
@@ -832,7 +837,7 @@ void SlaveStart(void* data)
832837

833838
/* main loop */
834839
while (Local[ProcessId].tnow < tstop + 0.1 * dtime) {
835-
stepsystem(ProcessId);
840+
stepsystem<is_preprocessing>(ProcessId);
836841
// printtree(Global->G_root);
837842
//printf("Going to next step!!!\n");
838843
}
@@ -1099,11 +1104,280 @@ long intpow(long i, long j)
10991104
return temp;
11001105
}
11011106

1107+
/*
1108+
* MAKETREE: initialize tree structure for hack force calculation.
1109+
*/
1110+
template<bool is_preprocessing>
1111+
void maketree(long ProcessId)
1112+
{
1113+
bodyptr p, *pp;
1114+
1115+
Local[ProcessId].myncell = 0;
1116+
Local[ProcessId].mynleaf = 0;
1117+
if (ProcessId == 0) {
1118+
Local[ProcessId].mycelltab[Local[ProcessId].myncell++] = Global->G_root;
1119+
}
1120+
Local[ProcessId].Current_Root = (nodeptr) Global->G_root;
1121+
for (pp = Local[ProcessId].mybodytab;
1122+
pp < Local[ProcessId].mybodytab+Local[ProcessId].mynbody; pp++) {
1123+
p = *pp;
1124+
if (Mass(p) != 0.0) {
1125+
Local[ProcessId].Current_Root
1126+
= (nodeptr) loadtree<is_preprocessing>(p, (cellptr) Local[ProcessId].Current_Root,
1127+
ProcessId);
1128+
}
1129+
else {
1130+
LOCK(Global->io_lock);
1131+
fprintf(stderr, "Process %ld found body %ld to have zero mass\n",
1132+
ProcessId, (long) p);
1133+
UNLOCK(Global->io_lock);
1134+
}
1135+
}
1136+
1137+
{
1138+
unsigned long Error, Cycle;
1139+
long Cancel, Temp;
1140+
1141+
Error = pthread_mutex_lock(&(Global->Barrier).mutex);
1142+
if (Error != 0) {
1143+
printf("Error while trying to get lock in barrier.\n");
1144+
exit(-1);
1145+
}
1146+
1147+
Cycle = (Global->Barrier).cycle;
1148+
if (++(Global->Barrier).counter != (NPROC)) {
1149+
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, (int *) &Cancel);
1150+
while (Cycle == (Global->Barrier).cycle) {
1151+
Error = pthread_cond_wait(&(Global->Barrier).cv, &(Global->Barrier).mutex);
1152+
if (Error != 0) {
1153+
break;
1154+
}
1155+
}
1156+
pthread_setcancelstate(Cancel, (int *) &Temp);
1157+
} else {
1158+
(Global->Barrier).cycle = !(Global->Barrier).cycle;
1159+
(Global->Barrier).counter = 0;
1160+
Error = pthread_cond_broadcast(&(Global->Barrier).cv);
1161+
}
1162+
pthread_mutex_unlock(&(Global->Barrier).mutex);
1163+
}
1164+
1165+
hackcofm(ProcessId );
1166+
{
1167+
unsigned long Error, Cycle;
1168+
long Cancel, Temp;
1169+
1170+
Error = pthread_mutex_lock(&(Global->Barrier).mutex);
1171+
if (Error != 0) {
1172+
printf("Error while trying to get lock in barrier.\n");
1173+
exit(-1);
1174+
}
1175+
1176+
Cycle = (Global->Barrier).cycle;
1177+
if (++(Global->Barrier).counter != (NPROC)) {
1178+
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, (int *) &Cancel);
1179+
while (Cycle == (Global->Barrier).cycle) {
1180+
Error = pthread_cond_wait(&(Global->Barrier).cv, &(Global->Barrier).mutex);
1181+
if (Error != 0) {
1182+
break;
1183+
}
1184+
}
1185+
pthread_setcancelstate(Cancel, (int *) &Temp);
1186+
} else {
1187+
(Global->Barrier).cycle = !(Global->Barrier).cycle;
1188+
(Global->Barrier).counter = 0;
1189+
Error = pthread_cond_broadcast(&(Global->Barrier).cv);
1190+
}
1191+
pthread_mutex_unlock(&(Global->Barrier).mutex);
1192+
}
1193+
}
1194+
1195+
/*
1196+
* * LOADTREE: descend tree and insert particle.
1197+
* */
1198+
template<bool is_preprocessing>
1199+
nodeptr loadtree(bodyptr p, cellptr root, long ProcessId)
1200+
{
1201+
long l, xp[NDIM], xor_arr[NDIM], flag;
1202+
long i, j, root_level;
1203+
bool valid_root;
1204+
long kidIndex;
1205+
volatile nodeptr *volatile qptr, mynode;
1206+
leafptr le;
1207+
1208+
intcoord(xp, Pos(p));
1209+
valid_root = TRUE;
1210+
for (i = 0; i < NDIM; i++) {
1211+
xor_arr[i] = xp[i] ^ Local[ProcessId].Root_Coords[i];
1212+
}
1213+
for (i = IMAX >> 1; i > Level(root); i >>= 1) {
1214+
for (j = 0; j < NDIM; j++) {
1215+
if (xor_arr[j] & i) {
1216+
valid_root = FALSE;
1217+
break;
1218+
}
1219+
}
1220+
if (!valid_root) {
1221+
break;
1222+
}
1223+
}
1224+
if (!valid_root) {
1225+
if (root != Global->G_root) {
1226+
root_level = Level(root);
1227+
for (j = i; j > root_level; j >>= 1) {
1228+
root = (cellptr) Parent(root);
1229+
}
1230+
valid_root = TRUE;
1231+
for (i = IMAX >> 1; i > Level(root); i >>= 1) {
1232+
for (j = 0; j < NDIM; j++) {
1233+
if (xor_arr[j] & i) {
1234+
valid_root = FALSE;
1235+
break;
1236+
}
1237+
}
1238+
if (!valid_root) {
1239+
printf("P%ld body %ld\n", ProcessId, p - bodytab);
1240+
root = Global->G_root;
1241+
}
1242+
}
1243+
}
1244+
}
1245+
root = Global->G_root;
1246+
mynode = (nodeptr) root;
1247+
kidIndex = subindex(xp, Level(mynode));
1248+
qptr = &Subp(mynode)[kidIndex];
1249+
1250+
l = Level(mynode) >> 1;
1251+
flag = TRUE;
1252+
while (flag) { /* loop descending tree */
1253+
if (l == 0) {
1254+
error("not enough levels in tree\n");
1255+
}
1256+
if (*qptr == NULL) {
1257+
/* lock the parent cell */
1258+
ALOCK(CellLock->CL, ((cellptr) mynode)->seqnum % MAXLOCK);
1259+
if constexpr (is_preprocessing)
1260+
{
1261+
pthread_spin_lock(&map_spinlock);
1262+
1263+
threadid_addresses_map[ProcessId].insert(
1264+
reinterpret_cast<double*>(
1265+
reinterpret_cast<uintptr_t>(&(CellLock->CL)) &
1266+
~(CACHELINE_SIZE - 1)
1267+
)
1268+
);
1269+
1270+
threadid_addresses_map[ProcessId].insert(
1271+
reinterpret_cast<double*>(
1272+
reinterpret_cast<uintptr_t>(&(((cellptr) mynode)->seqnum)) &
1273+
~(CACHELINE_SIZE - 1)
1274+
)
1275+
);
1276+
pthread_spin_unlock(&map_spinlock);
1277+
}
1278+
if (*qptr == NULL) {
1279+
le = InitLeaf((cellptr) mynode, ProcessId);
1280+
Parent(p) = (nodeptr) le;
1281+
Level(p) = l;
1282+
ChildNum(p) = le->num_bodies;
1283+
ChildNum(le) = kidIndex;
1284+
Bodyp(le)[le->num_bodies++] = p;
1285+
*qptr = (nodeptr) le;
1286+
flag = FALSE;
1287+
}
1288+
AULOCK(CellLock->CL, ((cellptr) mynode)->seqnum % MAXLOCK);
1289+
if constexpr (is_preprocessing)
1290+
{
1291+
pthread_spin_lock(&map_spinlock);
1292+
1293+
threadid_addresses_map[ProcessId].insert(
1294+
reinterpret_cast<double*>(
1295+
reinterpret_cast<uintptr_t>(&(CellLock->CL)) &
1296+
~(CACHELINE_SIZE - 1)
1297+
)
1298+
);
1299+
threadid_addresses_map[ProcessId].insert(
1300+
reinterpret_cast<double*>(
1301+
reinterpret_cast<uintptr_t>(&(((cellptr) mynode)->seqnum)) &
1302+
~(CACHELINE_SIZE - 1)
1303+
)
1304+
);
1305+
pthread_spin_unlock(&map_spinlock);
1306+
}
1307+
/* unlock the parent cell */
1308+
}
1309+
if (flag && *qptr && (Type(*qptr) == LEAF)) {
1310+
/* reached a "leaf"? */
1311+
ALOCK(CellLock->CL, ((cellptr) mynode)->seqnum % MAXLOCK);
1312+
if constexpr (is_preprocessing)
1313+
{
1314+
pthread_spin_lock(&map_spinlock);
1315+
1316+
threadid_addresses_map[ProcessId].insert(
1317+
reinterpret_cast<double*>(
1318+
reinterpret_cast<uintptr_t>(&(CellLock->CL)) &
1319+
~(CACHELINE_SIZE - 1)
1320+
)
1321+
);
1322+
threadid_addresses_map[ProcessId].insert(
1323+
reinterpret_cast<double*>(
1324+
reinterpret_cast<uintptr_t>(&(((cellptr) mynode)->seqnum)) &
1325+
~(CACHELINE_SIZE - 1)
1326+
)
1327+
);
1328+
pthread_spin_unlock(&map_spinlock);
1329+
}
1330+
/* lock the parent cell */
1331+
if (Type(*qptr) == LEAF) { /* still a "leaf"? */
1332+
le = (leafptr) *qptr;
1333+
if (le->num_bodies == MAX_BODIES_PER_LEAF) {
1334+
*qptr = (nodeptr) SubdivideLeaf(le, (cellptr) mynode, l,
1335+
ProcessId);
1336+
}
1337+
else {
1338+
Parent(p) = (nodeptr) le;
1339+
Level(p) = l;
1340+
ChildNum(p) = le->num_bodies;
1341+
Bodyp(le)[le->num_bodies++] = p;
1342+
flag = FALSE;
1343+
}
1344+
}
1345+
AULOCK(CellLock->CL, ((cellptr) mynode)->seqnum % MAXLOCK);
1346+
if constexpr (is_preprocessing)
1347+
{
1348+
pthread_spin_lock(&map_spinlock);
1349+
1350+
threadid_addresses_map[ProcessId].insert(
1351+
reinterpret_cast<double*>(
1352+
reinterpret_cast<uintptr_t>(&(CellLock->CL)) &
1353+
~(CACHELINE_SIZE - 1)
1354+
)
1355+
);
1356+
threadid_addresses_map[ProcessId].insert(
1357+
reinterpret_cast<double*>(
1358+
reinterpret_cast<uintptr_t>(&(((cellptr) mynode)->seqnum)) &
1359+
~(CACHELINE_SIZE - 1)
1360+
)
1361+
);
1362+
pthread_spin_unlock(&map_spinlock);
1363+
}
1364+
/* unlock the node */
1365+
}
1366+
if (flag) {
1367+
mynode = *qptr;
1368+
kidIndex = subindex(xp, l);
1369+
qptr = &Subp(*qptr)[kidIndex]; /* move down one level */
1370+
l = l >> 1; /* and test next bit */
1371+
}
1372+
}
1373+
SETV(Local[ProcessId].Root_Coords, xp);
1374+
return Parent((leafptr) *qptr);
1375+
}
11021376

11031377
/*
11041378
* STEPSYSTEM: advance N-body system one time-step.
11051379
*/
1106-
1380+
template<bool is_preprocessing>
11071381
void stepsystem(long ProcessId)
11081382
{
11091383
long i;
@@ -1168,7 +1442,7 @@ void stepsystem(long ProcessId)
11681442
}
11691443

11701444
/* load bodies into tree */
1171-
maketree(ProcessId);
1445+
maketree<is_preprocessing>(ProcessId);
11721446
if ((ProcessId == 0) && (Local[ProcessId].nstep >= 2)) {
11731447
CLOCK(treebuildend);
11741448
Global->treebuildtime += treebuildend - treebuildstart;

code.H

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,12 @@ struct local_memory {
135135
};
136136
global struct local_memory Local[MAX_PROC];
137137

138+
template<bool is_preprocessing>
138139
void SlaveStart(void* data);
140+
141+
template<bool is_preprocessing>
139142
void stepsystem(long ProcessId);
143+
140144
void ComputeForces(long ProcessId);
141145
void Help(void);
142146
void ANLinit(void);

0 commit comments

Comments
 (0)