-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlatencyMatrixMultiplicationAcc.py
More file actions
196 lines (169 loc) · 8.68 KB
/
Copy pathlatencyMatrixMultiplicationAcc.py
File metadata and controls
196 lines (169 loc) · 8.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import numpy as np
import matplotlib.pyplot as plt
import time
import json
import os
# ---- Memory-hierarchy and accelerator model parameters ----

# Access latencies, all expressed in cycles.
Lmem = 400          # main memory
L2mem = 50          # L2 cache
L1mem = 5           # L1 cache
Lregister = L1mem   # buffer register near/inside the DAC, modeled with the L1 latency
Lconversion = 2     # data conversion

# Accelerator settings.
M = 16  # length of the vector slices handed to the accelerator (minimum usable size)
# Cycles charged per offloaded dot product: a fixed 2 cycles plus the
# conversion latency and the register access latency.
Lacc = 2 + Lconversion + Lregister

# Cache capacities (bytes) and L2 -> L1 bandwidth.
L2_size = 10 * 1024**2
L1_size = 128 * 1024
L2L1rateTransfer = 64  # bytes per cycle
# Cycles charged for one tile load: the number of transfers needed to move
# L1_size bytes (L1_size / L2L1rateTransfer) times the L2 latency.
LoadingTimeL2L1 = L2mem * L1_size / L2L1rateTransfer

float_size = 2  # bytes per matrix element (2 bytes, i.e. a 16-bit value)
pL1 = 0.001     # probability that data is not in L1 and must be fetched from L2

# Snapshot of the model settings that is merged into every exported result file.
# NOTE: M and LoadingTimeL2L1 are not part of this snapshot.
parameters = dict(
    L1mem=L1mem,
    L2mem=L2mem,
    Lmem=Lmem,
    Lregister=Lregister,
    Lconversion=Lconversion,
    L2L1rateTransfer=L2L1rateTransfer,
    Lacc=Lacc,
    L2_size=L2_size,
    L1_size=L1_size,
    float_size=float_size,
    pL1=pL1,
)
def dict_to_json_file(dictionary, filename):
    """
    Convert a dictionary to JSON and save it in a file.

    The dictionary is serialized completely before the target file is opened,
    so a dictionary that cannot be serialized never truncates or partially
    overwrites an existing file. Failures are reported on stdout rather than
    raised (best-effort save).

    Parameters:
    dictionary (dict): The dictionary to be converted to JSON
    filename (str): The name of the file to save the JSON data

    Returns:
    None
    """
    try:
        # indent=4 makes the JSON file human-readable with proper indentation
        payload = json.dumps(dictionary, indent=4)
    except (TypeError, ValueError, RecursionError) as e:
        # Non-serializable value, circular reference, or excessive nesting
        print(f"An error occurred while saving the JSON file: {e}")
        return

    try:
        # Opening in write mode truncates the file, hence only done once the
        # payload is known to be valid
        with open(filename, 'w', encoding='utf-8') as json_file:
            json_file.write(payload)
    except (OSError, TypeError, ValueError) as e:
        # OSError: missing directory, permissions, ...; TypeError/ValueError:
        # an invalid filename object or path string
        print(f"An error occurred while saving the JSON file: {e}")
        return

    print(f"JSON data successfully saved to {filename}")
def compute_time_with_tiling_and_accelerator(N):
    """
    Simulate the cycle cost of a tiled N x N matrix product with an accelerator.

    The iteration space is walked in square tiles of int(sqrt(L1_size))
    elements per side. Every (i, j, k) tile triple is charged one L2 -> L1
    load (LoadingTimeL2L1 cycles). For each output element of a tile, the
    update is offloaded to the accelerator as one M-element dot product when
    at least M elements remain from the tile's k origin (N - k >= M);
    otherwise it is accumulated with scalar multiply-adds whose latency comes
    from the L1 / L2 / main-memory checks below.

    L1 misses are drawn with probability pL1 from NumPy's global random state,
    so results differ between calls unless that state is seeded by the caller.

    Parameters:
    N (int): Dimension of the square matrices

    Returns:
    tuple: (total_time, accessmemL1, accessmemL2, op, offloaded, miss) where
        total_time  - accumulated latency in cycles
        accessmemL1 - number of accesses served from L1
        accessmemL2 - number of accesses charged to L2, including
                      L1_size / L2L1rateTransfer per tile load
        op          - number of output-element updates, one per (ii, jj) and k tile
        offloaded   - accelerator dot products whose operands hit L1
        miss        - accelerator dot products whose operands missed L1
    """
    # Threshold used by the fallback path's L1-residency test.
    # NOTE(review): this divides an element count (N*N) by a size in bytes;
    # confirm that this is the intended threshold.
    tile_elements = N**2 // L1_size
    tile_size = int(np.sqrt(L1_size))  # Tile dimension (tile_size x tile_size)
    l2_capacity = L2_size // float_size  # Number of elements that fit in L2

    # Operand matrices; the product C is computed alongside the timing model
    # but is not part of the returned statistics
    A = np.random.rand(N, N)
    B = np.random.rand(N, N)
    C = np.zeros((N, N))

    total_time = 0
    offloaded = 0
    op = 0
    accessmemL1 = 0
    accessmemL2 = 0
    miss = 0

    for i in range(0, N, tile_size):
        i_end = min(i + tile_size, N)
        for j in range(0, N, tile_size):
            j_end = min(j + tile_size, N)
            for k in range(0, N, tile_size):
                k_end = min(k + tile_size, N)
                # Offloading is possible only if an M-long vector fits in the
                # remaining k extent; this is invariant over the tile
                use_accelerator = N - k >= M

                # For each tile, simulate loading into L1 cache
                total_time += LoadingTimeL2L1
                accessmemL2 += L1_size / L2L1rateTransfer

                for ii in range(i, i_end):
                    for jj in range(j, j_end):
                        op += 1
                        if use_accelerator:
                            # NOTE(review): only the first M elements of the
                            # k tile enter the dot product - confirm intended.
                            C[ii, jj] += np.dot(A[ii, k:k + M], B[k:k + M, jj])
                            if np.random.random() > pL1:
                                # Operands found in L1: accelerator latency only
                                total_time += Lacc
                                accessmemL1 += 1
                                offloaded += 1
                            else:
                                # L1 miss: the operands come from L2. The L2
                                # latency was previously stored in a local that
                                # this branch never added to total_time, so a
                                # miss cost zero cycles; charge it here.
                                # NOTE(review): decide whether Lacc should be
                                # charged on top of the L2 latency.
                                total_time += L2mem
                                accessmemL2 += 1
                                miss += 1
                        else:
                            # Fallback to scalar multiplication with the cache
                            # hierarchy model if M does not fit
                            for kk in range(k, k_end):
                                # Tile-local linear offsets of the A and B operands
                                a_offset = (ii - i) * tile_size + (kk - k)
                                b_offset = (kk - k) * tile_size + (jj - j)
                                if a_offset < tile_elements or b_offset < tile_elements:
                                    # Considered L1 resident, subject to a random miss
                                    if np.random.random() > pL1:
                                        latency = L1mem
                                        accessmemL1 += 1
                                    else:
                                        # L1 copy invalid or absent: fetch from L2
                                        latency = L2mem
                                        accessmemL2 += 1
                                elif a_offset < l2_capacity or b_offset < l2_capacity:
                                    # Not in L1 but within the L2 capacity
                                    latency = L2mem
                                    accessmemL2 += 1
                                else:
                                    # Neither L1 nor L2: main memory access
                                    latency = Lmem
                                # Perform multiplication and accumulate latency
                                C[ii, jj] += A[ii, kk] * B[kk, jj]
                                total_time += latency

    return total_time, accessmemL1, accessmemL2, op, offloaded, miss
# Sweep the simulator over several matrix dimensions and keep one
# dictionary per metric, each keyed by the matrix size N
matrix_sizes = [4, 16, 32, 64, 128, 256, 512]
results, L1access, L2access, offL, Miss = {}, {}, {}, {}, {}

for N in matrix_sizes:
    # Wall-clock time of the simulation itself (seconds), reported next to
    # the simulated cycle count
    start_time = time.time()
    outcome = compute_time_with_tiling_and_accelerator(N)
    end_time = time.time()

    total_time, accessmemL1, accessmemL2, op, offloaded, missed = outcome
    results[N], L1access[N], L2access[N] = total_time, accessmemL1, accessmemL2
    offL[N], Miss[N] = offloaded, missed

    print(f"Matrix Size: {N}x{N}, Computation Time with Tiling and Accelerator (cycles): {total_time}, Elapsed Time: {end_time - start_time:.4f} seconds")
    print(f"Matrix Size: {N}x{N}, memory access L1: {accessmemL1}, memory access L2: {accessmemL2}, number of op: {op}, offloaded: {offloaded}")
    print(f"Matrix Size: {N}x{N}, computing intensity:{op/total_time}, missed access: {missed}")

# Log-log plot of simulated cycles against matrix size; the window is shown
# without blocking so the script continues with the statistics run
plt.figure(1)
sizes = list(results.keys())
cycles = list(results.values())
plt.loglog(sizes, cycles, '--o')
plt.show(block=False)
plt.xlabel('Matrix Size [NxN]')
plt.ylabel('Number of Cycles [#]')
print("----------- STATISITCS -----------")
# Repeat the N = 128 simulation NBstat times to observe the spread caused by
# the random L1 misses
NBstat = 100
data = []  # total cycles of each run
mm = []    # accelerator L1 misses of each run
for _ in range(NBstat):
    total_time, _l1, _l2, _ops, _off, missed = compute_time_with_tiling_and_accelerator(128)
    data.append(total_time)
    mm.append(missed)

# 2 x 2 grid: one row per metric, violin plot on the left, box plot on the right
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(9, 4))
panels = (
    (data, 'Compute Time [cycles]'),
    (mm, 'Miss number'),
)
for row, (samples, title) in enumerate(panels):
    violin_ax, box_ax = axs[row]
    violin_ax.violinplot(samples,
                         showmeans=True,
                         showmedians=True)
    violin_ax.set_title(title)
    box_ax.boxplot(samples)
    box_ax.set_title('Box plot')
plt.show()
# Each exported dictionary starts from the model parameters and then adds one
# per-matrix-size metric (keys are the matrix sizes N)
ResultTotal = dict(parameters)
ResultTotal.update(results)
ResultL1access = dict(parameters)
ResultL1access.update(L1access)
ResultL2access = dict(parameters)
ResultL2access.update(L2access)
ResultOFLOADED = dict(parameters)
ResultOFLOADED.update(offL)
REsultMiss = dict(parameters)
REsultMiss.update(Miss)

# One random draw tags all files of this run with the same number in [0, 9999]
rndnb = np.random.random()
run_tag = int(rndnb * 10000)
FILETOTAL = f'total_{run_tag}.json'  # change accordingly
FILEL1 = f'L1_{run_tag}.json'  # change accordingly
FILEL2 = f'L2_{run_tag}.json'  # change accordingly
FILE_OFFLOAD = f'Offload_{run_tag}.json'  # change accordingly
FILE_MISS = f'Miss_{run_tag}.json'

# Destination folder shared by all output files - adjust path accordingly
OUTPUT_DIR = 'C:\\Users\\heltz\\Documents\\Research\\'
PATHTOTAL = os.path.join(OUTPUT_DIR, FILETOTAL)
PATHL1 = os.path.join(OUTPUT_DIR, FILEL1)
PATHL2 = os.path.join(OUTPUT_DIR, FILEL2)
PATHOFFLOAD = os.path.join(OUTPUT_DIR, FILE_OFFLOAD)
PATHMISS = os.path.join(OUTPUT_DIR, FILE_MISS)

# Write the files in the same order as they were declared
exports = (
    (ResultTotal, PATHTOTAL),
    (ResultL1access, PATHL1),
    (ResultL2access, PATHL2),
    (ResultOFLOADED, PATHOFFLOAD),
    (REsultMiss, PATHMISS),
)
for payload, destination in exports:
    dict_to_json_file(payload, filename=destination)