|
| 1 | +#!/usr/bin/python2.7 |
| 2 | + |
| 3 | +# public library |
| 4 | +import math |
| 5 | +import numpy as np |
| 6 | + |
class MultiLayerPerceptron(object):
    """Analytical cost model for one MLP layer mapped onto a systolic array.

    Given a layer description (``data``) and a hardware description
    (``sys_info``), the model picks how many input points to process per
    batch and estimates the resulting data transfer, buffer utilization,
    systolic array utilization and cycle count.

    ``data`` must provide:
        "ifmap":        a two-item sequence ``[N, Ci]``
        "out_channel":  ``Co``

    ``sys_info`` must provide:
        "sa_size":           systolic array dimension ``A``
        "memory_bandwidth":  divided by (bit_width / 8) to obtain ``B``
        "bit_width":         element width in bits
        "bufsize":           on-chip buffer size

    NOTE(review): buffer sizes are compared against element counts
    (e.g. ``Co * Ci``), so "bufsize" is presumably expressed in elements
    rather than bytes -- confirm against the caller.
    """

    # info for systolic array
    A = None  # systolic array dimension

    # memory bandwidth: "memory_bandwidth" divided by the element width in
    # bytes (presumably the number of elements transferable per cycle).
    B = None

    # on-chip buffer size
    buf_size = None

    # input layer dimension
    N = None   # number of points (NumberOfPoints x NumberOfFeature)
    Ci = None  # channels for ifmap
    Co = None  # channels for ofmap

    # on-chip buffer partition sizes (ifmap / ofmap / weights)
    bufi_size = None
    bufo_size = None
    bufw_size = None

    def __init__(self, data, sys_info):
        """Store the layer description and derive the hardware parameters.

        Args:
            data: layer description, read later by ``init_setup``.
            sys_info: hardware description (see the class docstring).
        """
        self.data = data
        self.sys_info = sys_info
        self.A = sys_info["sa_size"]
        # Divide by 8.0 so the bytes-per-element term is never truncated by
        # integer division (e.g. a 12-bit width is 1.5 bytes, not 1).
        self.B = sys_info["memory_bandwidth"] / (sys_info["bit_width"] / 8.0)
        self.buf_size = sys_info["bufsize"]

    def init_setup(self):
        """Load the layer dimensions from ``self.data``.

        Sets ``N``, ``Ci``, ``Co`` and the weight footprint ``bufw_size``.
        """
        layer_info = self.data

        # set up the new layer information
        [self.N, self.Ci] = layer_info["ifmap"]
        self.Co = layer_info["out_channel"]

        # the whole weight matrix is kept on chip
        self.bufw_size = self.Co * self.Ci

    ###############################################################
    #                       general process                       #
    ###############################################################

    def buffer_utilization(self, x):
        """Return the buffer footprint for a batch of ``x`` points.

        footprint = ofmap tile + weights + ifmap tile
        """
        return (x * self.Co + self.Ci * self.Co + x * self.Ci)

    def data_transfer(self, x):
        """Return the total data transferred when batching ``x`` points.

        total = (ofmap tile + ifmap tile) * number of batches + weights
        """
        # number of batches needed to cover all N points
        total_batch = math.ceil(float(self.N) / x)

        # ofmap, ifmap and kernel tile size
        ofmap_tile_size = self.Co * x
        ifmap_tile_size = self.Ci * x
        kernel_tile_size = self.Co * self.Ci

        # ofmap + ifmap are moved once per batch; the weights are moved once
        return (ofmap_tile_size + ifmap_tile_size) * total_batch \
            + kernel_tile_size

    def systolic_array_utilization(self, x):
        """Return the fraction of the A x A array doing useful work.

        The result is the ratio of the useful ``x * Co`` tile to the same
        tile padded up to multiples of ``A`` in both dimensions, scaled by
        the extra-delay factor ``Ci / (Ci + A_w_uiti)``.
        """
        A = self.A
        # number of passes needed to cover Co with an array that is A wide
        co_passes = math.ceil(float(self.Co) / A)
        A_w_uiti = math.ceil(self.Co / co_passes)

        total_usage = x * self.Co
        # Convert to float *before* dividing: ``float(x/A)`` truncates first
        # when both operands are ints under Python 2, defeating the ceil.
        round_up_val = math.ceil(float(x) / A) * A \
            * co_passes * A

        # the pct of extra delay due to output-stationary
        delay_pct = float(self.Ci) / (self.Ci + A_w_uiti)

        return delay_pct * total_usage / round_up_val

    def compute_bound_cycle(self, util_rate):
        """Return the compute cycles at the given array utilization rate."""
        # total number of ops
        total_computation = (self.N * self.Ci * self.Co)

        # systolic array calculation capacity
        comp_cap = (self.A * self.A) * util_rate

        return total_computation / comp_cap

    def process_parameter(self, x):
        """Evaluate the cost model for a batch size of (about) ``x`` points.

        Args:
            x: requested number of points per batch; floored, then evened
                out across batches and capped at ``N``.

        Returns:
            dict with keys "total_transfer", "total_cycle",
            "systolic_array_utilization", "buffer_utilization", "x_0" and
            "Bound" ("C" when compute-bound, "M" when memory-bound).

        Raises:
            SystemExit: if the buffer utilization exceeds 101%.
        """
        x = math.floor(x)
        # make the tile size even for every batch
        x_0 = min(self.N / math.ceil(self.N / round(x)), self.N)

        # (ofmap + ifmap)*total_batch + weights
        total_transfer = self.data_transfer(x_0)

        # compute the utilization of systolic array
        util_sys_arr = self.systolic_array_utilization(x_0)

        # compute the utilization of buffer
        util_buf = float(self.buffer_utilization(x_0)) / self.buf_size

        # a 1% overshoot is tolerated
        if util_buf > 1.01:
            print("ERROR: the utilization of buffer is over 100%")
            # The site-provided exit() is meant for interactive use and
            # reports success; raise SystemExit with a failure status.
            raise SystemExit(1)

        # The layer takes as long as the slower of computing all elements
        # and moving all data.
        compute_cycle = self.compute_bound_cycle(util_sys_arr)
        memory_cycle = total_transfer / self.B
        if compute_cycle > memory_cycle:
            bound = "C"
            total_cycle = compute_cycle
        else:
            bound = "M"
            total_cycle = memory_cycle

        ret = {
            "total_transfer": round(total_transfer),
            "total_cycle": round(total_cycle),
            "systolic_array_utilization": util_sys_arr,
            "buffer_utilization": util_buf,
            "x_0": x_0,
            "Bound": bound,
        }

        return ret

    # optimize one layer
    def optimize(self):
        """Choose a batch size for the layer and return its cost estimate.

        Returns:
            The dict produced by ``process_parameter`` for the chosen size.

        Raises:
            SystemExit: if the weights alone do not fit in the buffer, or
                if ``process_parameter`` rejects the chosen batch size.
        """
        self.init_setup()

        # if the weights alone exceed self.buf_size there is no room left
        # for ifmap/ofmap tiles, so give up on this layer.
        if self.bufw_size > self.buf_size:
            print("FAIL: the entire weight cannot be stored in buffer")
            raise SystemExit(1)

        # split what is left after the weights between ifmap and ofmap in
        # proportion to their channel counts
        remaining = self.buf_size - self.bufw_size
        self.bufi_size = remaining * self.Ci / (self.Ci + self.Co)
        self.bufo_size = remaining * self.Co / (self.Ci + self.Co)

        # set the initial guess: one array-height worth of points
        x0 = self.A

        # grow the batch in steps of A while more points remain and the
        # next step still fits in the ifmap share of the buffer.
        while x0 < self.N and (x0 + self.A) * self.Ci < self.bufi_size:
            x0 = x0 + self.A

        return self.process_parameter(x0)
0 commit comments