00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 """Offlaod loop to SPE"""
00033
00034 from ctrump.optimizer import *
00035 from ctrump.Builder import *
00036 import ctrump
00037 import ctrump.spe
00038
# Shorthand for the loop exit-condition helper class.
EI = ctrump.ExitInfo

# Alignment-check policy applied to DMA transfers.
ALIGN_CHECK_NONE = 0
ALIGN_CHECK_16 = 1
ALIGN_CHECK_128 = 2

# Which SPE runtime library generated code targets.
SPE_LIBSPE2 = 0
SPE_LIBSPE1 = 1

# Maximum size in bytes of a single MFC DMA transfer.
MFC_MAX=16384

import ctrump.libspe2

# Runtime back-end modules, indexed by the SPE_LIBSPE* constants above
# (only libspe2 is registered here).
spe_runtime_table = [
ctrump.libspe2
]

# Display name of this translator pass.
SELF_TRANSLATOR_NAME = 'SPE Offloading'
00057
# Translator-wide options: tuples of (key, UI label, help text,
# default-or-choice-list).  The label/help strings are user-facing
# Japanese UI text read at runtime — do not translate them.
global_options_table = [
('address_mode', 'アドレスモード',
'''64bitアドレスか32bitアドレスかを指定します''', [('32',32), ('64',64)]),

('max_spe_num', 'SPEの最大数',
'''SPEの最大数を指定します。
パラメータを入れる配列のサイズを指定するのに使います
''',
6)

]
00086
# Per-translation options: tuples of (key, UI label, help text, default).
# The label/help strings are user-facing Japanese UI text read at runtime —
# do not translate them.
translate_options_table = [
('buf_size', "ローカルバッファのサイズ",
'''使うローカルバッファのサイズをバイト単位で指定します。
小さすぎると正しく動作しません''', 16384),
('doublebuffering', 'ダブルバッファリングする',
'''DMAのダブルバッファリングを行います。
メモリレイテンシを隠蔽できますが、一度に使えるローカルバッファのサイズが小さくなります''', True),
('get_align_128', 'GETを128byte境界に揃える',
'''バッファを少し多めに使い、転送が128byte境界に揃うようにします(無効時は16に揃えます)
転送速度が速くなることがありますが、一度に使えるローカルバッファのサイズが小さくなります''', True),

('name_of_spe_program', 'プログラムオブジェクトの名前',
'''プログラムオブジェクトの名前を指定します''',
'spe_main'),

('parameter_var_name', 'パラメータ変数の名前',
'''SPEに渡すパラメータの変数の名前を指定します''',
'spe_param'),

]
00117
class EmitEnv:
    """Shared mutable state threaded through SPE code emission."""
    def __init__(self, abi, ea_type, aligned, ignore_unaligned_data, double_buffering):
        # Struct conversions: emitted list and those still pending emission.
        self.struct_list = []
        self.struct_pending = []
        # Bookkeeping of generated variable names -> their memop records.
        self.variables = {}
        self.structs = {}
        # Induction variables collected across blocked / scalar loops.
        self.all_block_iv = []
        self.all_scalar_iv = []
        self.abi = abi                          # target ABI, used for sizeof/offset queries
        self.ea_type = ea_type                  # integer type used for effective addresses
        self.align_size = aligned               # DMA alignment granularity in bytes
        self.ignore_unalign = ignore_unaligned_data
        self.double_buffering = double_buffering
00131
# Option wrappers over the tables above (Option presumably comes from the
# `ctrump.optimizer` star import — TODO confirm).
global_option = Option(global_options_table)
translate_option = Option(translate_options_table)
00134
def align_ceil(v, align):
    """Round v up to the nearest multiple of align."""
    bumped = v + (align - 1)
    return bumped - bumped % align
00139
def align_mask_ceil(v, align, b):
    """Build an expression rounding v up to power-of-two `align` by masking."""
    mask = align - 1
    return b.paren(v + mask) & b.bcmpl(mask)
00142
def align_floor(v, align):
    """Round v down to the nearest multiple of align."""
    return v - (v % align)
00146
def gen_ceil_div(l, r):
    """Build ceil(l / r); works on ints and on builder expression objects."""
    numerator = l + r - 1
    return numerator / r
00149
def mul_if_not1(x, y):
    """Multiply x and y, folding away literal 0/1 factors.

    Used on mixed int/builder-expression operands so trivial factors do not
    clutter the generated code.  Note the operand order y*x when x is the
    integer side, matching the original emission order.
    """
    if isinstance(y, int):
        if y == 0:
            return 0
        return x if y == 1 else x * y
    if isinstance(x, int):
        if x == 0:
            return 0
        return y if x == 1 else y * x
    return x * y
00164
def plus_if_not0(x, y):
    """Add x and y, folding away literal zeros.

    A negative integer y is emitted as a subtraction (x - -y) so generated
    code reads naturally.
    """
    if isinstance(y, int):
        if y == 0:
            return x
        if y < 0:
            return x - -y
    if isinstance(x, int):
        return y if x == 0 else y + x
    return x + y
00176
def minus_if_not0(x, y):
    """Subtract y from x, folding away literal zeros.

    A negative integer y is emitted as an addition (x + -y); a zero x emits
    just the negation of y.
    """
    if isinstance(y, int):
        if y == 0:
            return x
        if y < 0:
            return x + -y
    if isinstance(x, int):
        return -y if x == 0 else -y + x
    return x - y
00188
def convert_cond(builder, exit_info, bound_expr):
    """Build `<inductive var> <cmp_op> <bound_expr>` as a comparison."""
    iv_ref = builder.varref(exit_info.inductive)
    return builder.binary(exit_info.cmp_op, iv_ref, bound_expr)
00192
def pointer_add(b, base, add):
    """Build (void *)((uint)base + add): byte-wise pointer arithmetic."""
    void_ptr_type = b.pointer_to(b.void)
    addr = b.cast(b.uint, base) + add
    return b.cast(void_ptr_type, addr)
00195
def append_typedefs(texpr, table):
    """Collect typedef-name/struct type nodes reachable from texpr.

    table is used as a set (node -> True); qualified and pointer types are
    unwrapped recursively.
    """
    code = texpr.code
    if code in (ctrump.TYPE_TYPEDEF_NAME, ctrump.TYPE_STRUCT):
        table[texpr] = True
    elif code == ctrump.TYPE_QUALIFIED:
        append_typedefs(texpr.unqualified_type, table)
    elif code == ctrump.TYPE_POINTER:
        append_typedefs(texpr.pointer_to, table)
00204
def spe_in_param_member(have_spe_output, param, name):
    """Access an input-parameter member: param.in.<name> when the parameter
    struct has a separate output section, else param.<name> directly."""
    holder = param.member('in') if have_spe_output else param
    return holder.member(name)
00210
00211
def emit_unaligned_mfc(b, mod_var, len_var, align_size, mfc_func, dma_buffer, ea_type, ea_expr, dma_size, tag):
    """Emit a complete unaligned MFC transfer: compute the misalignment of
    ea_expr, the rounded-up length, then the DMA call itself."""
    misalign = b.assign(mod_var, b.band(b.paren(ea_expr), b.literal(align_size-1)))
    length = b.assign(len_var, align_mask_ceil(mod_var + dma_size, align_size, b))
    transfer = mfc_func(dma_buffer,
                        b.binary(ctrump.EXPR_BIN_SUB, ea_expr, mod_var),
                        len_var,
                        tag, 0, 0)
    return [misalign, length, transfer, b.newline()]
00223
def emit_unaligned_mfc_mod(b, mod_var, len_var, align_size, mfc_func, dma_buffer, ea_type, ea_expr, dma_size, tag):
    """Emit only the mod/len bookkeeping half of an unaligned MFC transfer."""
    stmts = [b.assign(mod_var, b.band(b.cast(ea_type, ea_expr), b.literal(align_size-1)))]
    stmts.append(b.assign(len_var, align_mask_ceil(mod_var + dma_size, align_size, b)))
    return stmts
00228
def emit_unaligned_mfc_dma(b, mod_var, len_var, align_size, mfc_func, dma_buffer, ea_type, ea_expr, dma_size, tag):
    """Emit only the DMA half of an unaligned MFC transfer (length taken
    from len_var computed by emit_unaligned_mfc_mod)."""
    aligned_ea = b.binary(ctrump.EXPR_BIN_SUB, ea_expr, mod_var)
    transfer = mfc_func(dma_buffer, aligned_ea, len_var, tag, 0, 0)
    return [transfer, b.newline()]
00235
def emit_unaligned_mfc_scalar_mod(b, mod_var, len_var, align_size, mfc_func, dma_buffer, ea_type, ea_expr, dma_size, tag):
    """Emit the misalignment computation for a scalar (fixed-size) access."""
    misalign = b.band(b.cast(ea_type, ea_expr), b.literal(align_size-1))
    return [b.assign(mod_var, misalign)]
def emit_unaligned_mfc_scalar_dma(b, mod_var, len_var, align_size, mfc_func, dma_buffer, ea_type, ea_expr, dma_size, tag):
    """Emit the DMA call for a scalar access; the size is the fixed
    dma_size rather than a computed length variable."""
    aligned_ea = b.binary(ctrump.EXPR_BIN_SUB, ea_expr, mod_var)
    transfer = mfc_func(dma_buffer, aligned_ea, dma_size, tag, 0, 0)
    return [transfer, b.newline()]
00246
00247
00248
00249
00250
def name_of_subscript(subscript):
    """Build a variable-name suffix identifying one subscript.

    Record members contribute '_<member>', each index its variable name (or
    '_ptrinc'), and the constant offset '_<n>' / '_s<n>' for negatives.  An
    index-less subscript with zero offset still gets an explicit '_0'.
    """
    parts = []

    if subscript.code in (ctrump.LOOP_SUBSCRIPT_RECORD_MEMBER,
                          ctrump.LOOP_SUBSCRIPT_RECORD_MEMBER_TERMINAL):
        parts.append('_%s' % subscript.member_name)

    for idx in subscript.indices:
        if idx.code in (ctrump.LOOP_INDEX_INDUCTIVE,
                        ctrump.LOOP_INDEX_INVARIANT):
            parts.append('_%s' % idx.var.name)
        elif idx.code == ctrump.LOOP_INDEX_POINTER_INC:
            parts.append('_ptrinc')

    off = subscript.offset
    if off < 0:
        parts.append('_s%d' % -off)
    elif off > 0:
        parts.append('_%d' % off)
    elif not subscript.indices:
        # Only an index-less, zero-offset subscript spells the 0 explicitly.
        parts.append('_0')

    return ''.join(parts)
00281
def memop_var_name_suffix(op):
    """Name suffix for a memory op: the chain parent's suffix (if any)
    followed by one part per subscript."""
    suffix = memop_var_name_suffix(op.chain) if op.chain else ''
    for sub in op.subscripts:
        suffix += name_of_subscript(sub)
    return suffix
00290
def sequential_test_0(ops, loop_nest_level, seq_map):
    """Classify each memory op in `ops` as sequential or not.

    An op is "sequential" when the innermost inductive (or pointer-increment)
    index it uses belongs to the loop at `loop_nest_level`.  For every op this
    sets: is_sequential, sequential_subscript, sequential_subscript_index,
    sequential_incr and is_pointer_inc, and marks in `seq_map` each nest level
    that drives some inductive access.
    """
    for i in ops:
        last_iv_level = None
        pointer_inc = False

        num_sub = len(i.subscripts)

        for j in range(0, num_sub):
            for k in i.subscripts[j].indices:
                if k.code == ctrump.LOOP_INDEX_INDUCTIVE:
                    last_iv_level = k.iv_level
                    last_iv_index = j
                    sequential_subscript = i.subscripts[j]
                    incr = k.incr
                    pointer_inc = False

                elif k.code == ctrump.LOOP_INDEX_INVARIANT:
                    pass

                elif k.code == ctrump.LOOP_INDEX_POINTER_INC:
                    last_iv_level = k.iv_level
                    last_iv_index = j
                    # BUG FIX: was `subscripts[j]` — an undefined name that
                    # raised NameError whenever a pointer-increment index was
                    # seen; the inductive branch above uses i.subscripts[j].
                    sequential_subscript = i.subscripts[j]
                    incr = k.incr
                    pointer_inc = True

        if last_iv_level is None:
            # No inductive index at all: the op can never be sequential.
            i.is_sequential = False
            continue

        seq_map[i.loop_node.nest_level] = True
        i.is_sequential = (last_iv_level == loop_nest_level)
        i.sequential_subscript = sequential_subscript
        i.sequential_subscript_index = last_iv_index
        i.sequential_incr = incr
        i.is_pointer_inc = pointer_inc
00328
00329
def sequential_test(memop_node, seq_map):
    """Run the sequential-access classification over memop_node's subtree."""
    sequential_test_0(memop_node.ops, memop_node.loop_node.nest_level, seq_map)
    for child in memop_node.children:
        sequential_test(child, seq_map)
00337
00338
def calc_memop_size_1(op, mem_size, emit_env):
    """Accumulate buffer-size statistics for one memory op into mem_size.

    Sequential ops contribute their per-iteration transfer size to seq_size;
    non-sequential (scalar) ops contribute their access size to scalar_size.
    Also stores per-op access/load sizes and updates the min/max/lcm trackers
    used later to pick a block size.
    """
    access_type = op.access_data_type()
    # Pointers travel as effective-address integers on the SPE side.
    load_type = convert_ptr_to_ea(op.load_data_type(), emit_env.ea_type)

    access_sz = ctrump.calc_type_size(emit_env.abi, access_type)
    load_sz = ctrump.calc_type_size(emit_env.abi, load_type)

    if op.is_sequential:
        # Bytes consumed per loop iteration by this op.
        sz = load_sz * op.sequential_incr
        mem_size.seq_size += sz
    else:
        sz = access_sz
        mem_size.scalar_size += sz

    op.access_data_size = access_sz
    op.load_data_size = load_sz

    (range_min, range_max) = op.load_data_range()

    if sz < mem_size.min_size:
        mem_size.min_size = sz
    if sz > mem_size.max_size:
        mem_size.max_size = sz
        # NOTE(review): max_range is only refreshed when a new max_size is
        # found, so it tracks the range of the largest op — confirm intended.
        mem_size.max_range = (range_max - range_min)*sz

    # Smallest byte count that is a whole number of iterations for every op.
    mem_size.loop_unit_byte = lcm(mem_size.loop_unit_byte, sz)
    mem_size.num_buffer += 1

    d = range_max - range_min
    mem_size.range_size += align_ceil(sz * d, emit_env.align_size)
00369
def calc_memop_size_0(memop_node, mem_size, emit_env):
    """Accumulate per-op size statistics over memop_node and its children."""
    for op in memop_node.ops:
        calc_memop_size_1(op, mem_size, emit_env)
    for child in memop_node.children:
        calc_memop_size_0(child, mem_size, emit_env)
00376
def calc_memop_size(memop_node, emit_env, buffer_size):
    """Work out how the SPE local-store buffer of buffer_size bytes is split.

    Returns a MemSize record holding the per-class byte totals, how many loop
    iterations fit in one DMA block (block_elem_count), and the
    double-buffering layout.  Raises nothing itself; an unusable layout is
    detected later via block_elem_count <= 0.

    NOTE(review): divisions below use `/`, i.e. integer division under the
    Python 2 era this generator targets — confirm before running on Python 3.
    """
    class MemSize:
        def __init__(self, align_size):
            self.scalar_size = 0                # bytes used by non-sequential accesses
            self.seq_size = 0                   # bytes per iteration of sequential accesses
            self.num_buffer = 0                 # number of distinct buffers
            self.min_size = MFC_MAX             # smallest single contribution seen
            self.max_size = 0                   # largest single contribution seen
            self.max_range = 0
            self.loop_unit_byte = align_size    # lcm of all contribution sizes
            self.offset = 0                     # running offset for sequential buffers
            self.scalar_offset = 0              # running offset for scalar buffers
            self.range_size = 0
    align_size = emit_env.align_size
    memsize = MemSize(align_size)

    calc_memop_size_0(memop_node, memsize, emit_env)

    # Space left for sequential data once the scalar slots are carved out.
    sequential_buffer_size = buffer_size - align_ceil(memsize.scalar_size, align_size)

    align_padding = align_size * 2
    padding_size = memsize.num_buffer * (align_padding + align_size) + memsize.range_size

    if emit_env.double_buffering:
        # Halve the buffer: one half in flight while the other is computed on.
        available_buffer_size = align_floor(sequential_buffer_size/2 - padding_size, align_size)
        second_buffer_start = align_floor(sequential_buffer_size/2, align_size)
        memsize.second_buffer_start = second_buffer_start
    else:
        available_buffer_size = sequential_buffer_size - padding_size
        second_buffer_start = 0

    have_seq_access = False

    # NOTE(review): only the top node's ops are inspected here, while
    # calc_memop_size_0 above walked the whole subtree — confirm intended.
    for i in memop_node.ops:
        if i.is_sequential:
            have_seq_access = True

    loop_unit_byte = memsize.loop_unit_byte
    min_size = memsize.min_size
    max_size = memsize.max_size
    max_range = memsize.max_range
    seq_size = memsize.seq_size

    if have_seq_access:
        # Iterations per dispatch so every op transfers whole elements.
        spe_dispatch_unit = loop_unit_byte/min_size
    else:
        spe_dispatch_unit = 1

    if seq_size == 0:
        # No sequential traffic: a "block" degenerates to one iteration.
        block_elem_count = 1
        spe_dispatch_unit = 1
    else:
        block_elem_count = available_buffer_size / seq_size

    block_elem_count = align_floor(block_elem_count, spe_dispatch_unit)

    # Clamp so the largest single transfer still fits in one MFC request.
    if block_elem_count*max_size > (MFC_MAX-align_padding-max_range):
        block_elem_count = align_floor((MFC_MAX-align_padding-max_range)/max_size, spe_dispatch_unit)

    memsize.block_elem_count = block_elem_count
    memsize.available_buffer_size = available_buffer_size
    memsize.second_buffer_start = second_buffer_start
    memsize.spe_dispatch_unit = spe_dispatch_unit
    memsize.align_padding = align_padding

    return memsize
00446
def convert_ptr_to_ea(ptr_type, ea_type):
    # Pointers are transferred as effective-address integers on the SPE side,
    # so pointer types are replaced by ea_type; other types pass through.
    # NOTE(review): the recursive call re-examines ea_type itself; it only
    # terminates because ea_type is expected to be a non-pointer integer
    # type — confirm ea_type can never itself be a pointer.
    if ptr_type.code == ctrump.TYPE_POINTER:
        return convert_ptr_to_ea(ea_type, ea_type)
    return ptr_type
00451
def get_loop_symbol(loop_node):
    """Name of the loop's induction variable."""
    (inductive, _reach, _incr) = EI.get_loop_counter(loop_node.exit_info)
    return inductive.name
00455
def emit_decl_if_not_declared(decls, b, t, name, env):
    """Append a declaration of `name` with type `t` unless env records it."""
    if name not in env:
        decls.append(b.decl(t, name))
00459
def decl_subscript(b, decls, subscripts, env):
    """Declare (as int) every inductive index variable not yet in env."""
    for sub in subscripts:
        for idx in sub.indices:
            if idx.code != ctrump.LOOP_INDEX_INDUCTIVE:
                continue
            if idx.var.name not in env:
                emit_decl_if_not_declared(decls, b, b.int_, idx.var.name, env)
                env[idx.var.name] = True
00467
def decl_memop_buffer_ops(decls, b, ops, mem_size, loop_symbol,
                          block_elem_count, emit_env):
    """Declare the local-store buffer, mod/len bookkeeping variables and the
    loop pointer for every memory op in `ops`.

    Declarations are appended to `decls`; mem_size.offset /
    mem_size.scalar_offset advance as buffer space is handed out.  Ops whose
    loop-pointer name was already declared reuse the earlier buffer_var.
    """
    s = b.scope

    ea_type = emit_env.ea_type
    abi = emit_env.abi
    variables = emit_env.variables
    align_size = emit_env.align_size
    double_buffering = emit_env.double_buffering

    for i in ops:
        access_data_type = i.access_data_type()
        load_data_type = i.load_data_type()
        scalar_access_type = convert_ptr_to_ea(access_data_type, ea_type)
        scalar_elem_sz = ctrump.calc_type_size(abi, access_data_type)
        vector_access_type = convert_ptr_to_ea(load_data_type, ea_type)
        vector_elem_sz = ctrump.calc_type_size(abi, load_data_type)
        array_var = i.array.var

        subscripts = i.subscripts
        seq_buffer = None

        suffix = ''

        decl_subscript(b, decls, subscripts, variables)

        suffix = memop_var_name_suffix(i)

        buffer_type = b.pointer_to(b.uchar)
        mem_size.offset += align_size

        (range_min, range_max) = i.load_data_range()
        d = range_max - range_min
        range_size = vector_elem_sz * d

        if i.is_sequential:
            # Sequential op: carve block_elem_count elements out of the
            # sequential region of the shared spe_buffer.
            buffer_init = b.cast(buffer_type, s.spe_buffer + mem_size.offset)
            elem_sz = vector_elem_sz
            sz = vector_elem_sz * i.sequential_incr
            mem_size.offset = align_ceil(mem_size.offset + sz, align_size)
            access_type = ctrump.spe.convert_type_ppe_to_spe(b, vector_access_type, emit_env.structs,
                                                             emit_env.struct_pending, emit_env.struct_list,
                                                             emit_env.abi)

            mem_size.offset += align_ceil(block_elem_count*sz, align_size)
        else:
            # Scalar op: a single-element slot in the scalar region.
            buffer_init = b.cast(buffer_type, s.spe_buffer + mem_size.scalar_offset)
            elem_sz = scalar_elem_sz
            sz = scalar_elem_sz + range_size
            mem_size.scalar_offset = align_ceil(mem_size.scalar_offset + sz, align_size)
            access_type = ctrump.spe.convert_type_ppe_to_spe(b, scalar_access_type, emit_env.structs,
                                                             emit_env.struct_pending, emit_env.struct_list,
                                                             emit_env.abi)
            load_data_type = ctrump.spe.convert_type_ppe_to_spe(b, load_data_type, emit_env.structs,
                                                                emit_env.struct_pending,
                                                                emit_env.struct_list, emit_env.abi)

        # NOTE(review): this unconditionally resets scalar_offset to the
        # aligned size of a single element, discarding the accumulation done
        # above — looks like it may have been meant as `+=`; confirm.
        mem_size.scalar_offset = align_ceil(elem_sz, align_size)

        mem_size.offset += align_ceil(range_size, align_size)
        loop_pointer_name = '%s%s'%(array_var.name, suffix)

        if loop_pointer_name in variables:
            # Already declared by an earlier op touching the same element.
            i.buffer_var = variables[loop_pointer_name]
        else:
            buffer_name = '%s%s_base'%(array_var.name,suffix)
            mod_var_name = '%s%s_mod'%(array_var.name,suffix)
            len_var_name = '%s%s_len'%(array_var.name,suffix)

            if double_buffering:
                # Two buffer sides plus per-side mod/len so DMA can overlap
                # with computation.
                # NOTE(review): mem_size.secondary_offset is not set by
                # calc_memop_size (which defines second_buffer_start) —
                # confirm this attribute exists by the time this runs.
                decls.append([
                    b.decl(b.array_of(buffer_type,2), buffer_name),
                    b.assign(s[buffer_name][0], buffer_init),
                    b.assign(s[buffer_name][1], buffer_init + mem_size.secondary_offset),
                    b.decl(b.array_of(b.int_,2), mod_var_name),
                    b.decl(b.array_of(b.int_,2), len_var_name)])
            else:
                decls.append([
                    b.decl(buffer_type, buffer_name,
                           init=buffer_init),
                    b.decl(b.int_, mod_var_name),
                    b.decl(b.int_, len_var_name)])

            decls.append(b.decl(b.pointer_to(load_data_type), loop_pointer_name))
            var = s[loop_pointer_name]
            i.loop_pointer = var
            i.buffer = s[buffer_name]
            i.mod_var = s[mod_var_name]
            i.len_var = s[len_var_name]

            variables[loop_pointer_name] = i

            i.buffer_var = i
00562
def decl_memop_buffer(decls, b, memop_node, mem_size, block_elem_count, emit_env):
    """Declare local-store buffers for every op under memop_node."""
    loop_symbol = get_loop_symbol(memop_node.loop_node)
    decl_memop_buffer_ops(decls, b, memop_node.ops, mem_size, loop_symbol,
                          block_elem_count, emit_env)
    for child in memop_node.children:
        decl_memop_buffer(decls, b, child, mem_size, block_elem_count, emit_env)
00572
def index_list_iv_level(indices):
    """Nest level of the last inductive/pointer-increment index, or None."""
    iv_level = None
    for idx in indices:
        if idx.code in (ctrump.LOOP_INDEX_INDUCTIVE,
                        ctrump.LOOP_INDEX_POINTER_INC):
            iv_level = idx.iv_level
    return iv_level
00581
00582
def subscript_offset(b, sub, byte_offset_expr, elem_offset_expr, is_vector, abi):
    """Fold one subscript's contribution into the running offsets.

    Coefficient subscripts scale the element offset; record-member
    subscripts add the member's byte offset (scalar accesses only — a
    vector transfer moves the whole record).  Returns the updated
    (byte_offset_expr, elem_offset_expr) pair.

    Raises Exception for subscript kinds this generator cannot handle.
    """
    if (sub.code == ctrump.LOOP_SUBSCRIPT_COEF_ARRAYSIZE or
        sub.code == ctrump.LOOP_SUBSCRIPT_COEF_CONSTANT):
        elem_offset_expr = mul_if_not1(sub.array_size_coef, elem_offset_expr)

    elif sub.code == ctrump.LOOP_SUBSCRIPT_COEF_SCALE:
        elem_offset_expr = mul_if_not1(b.varref(sub.scale_array_coef), elem_offset_expr)

    elif (sub.code == ctrump.LOOP_SUBSCRIPT_RECORD_MEMBER or
          sub.code == ctrump.LOOP_SUBSCRIPT_RECORD_MEMBER_TERMINAL):
        if not is_vector:
            field_offset = ctrump.struct_field_offset(abi, sub.record_type, sub.member_name)
            byte_offset_expr = plus_if_not0(field_offset, byte_offset_expr)
    elif sub.code == ctrump.LOOP_SUBSCRIPT_COEF_TERMINAL:
        pass
    elif sub.code == ctrump.LOOP_SUBSCRIPT_LOAD_RECORD:
        pass
    else:
        # BUG FIX: the message previously read 'unkonwn'.
        raise Exception('unknown subscript %s'%ctrump.loop_subscript_code_string_table[sub.code])

    return (byte_offset_expr, elem_offset_expr)
00604
def index_list_scalar_access(b, indices, start_offset, loop_top_level):
    """Element-offset expression for a scalar (per-iteration DMA) access.

    The blocked top-level induction variable is shifted by start_offset;
    other induction variables and loop invariants are referenced directly;
    pointer increments contribute no explicit index expression.
    """
    s = b.scope
    offset_expr = 0

    for i in indices:
        if i.code == ctrump.LOOP_INDEX_INDUCTIVE:
            if i.iv_level == loop_top_level:
                offset_expr = plus_if_not0(offset_expr,
                                           s[i.var.name] + start_offset)
            else:
                # BUG FIX: was s[i.var.var.name] — `var` has no `var`
                # attribute anywhere else in this file; every sibling
                # function reads i.var.name.
                offset_expr = plus_if_not0(offset_expr, s[i.var.name])

        elif i.code == ctrump.LOOP_INDEX_INVARIANT:
            offset_expr = plus_if_not0(offset_expr, s[i.invariant.name])

        elif i.code == ctrump.LOOP_INDEX_POINTER_INC:
            pass

    return offset_expr
00626
def subscript_list_scalar_access(b, subscripts, start_offset, loop_top_level, abi):
    """Byte/element offset expressions for a scalar (non-sequential) access."""
    byte_offset_expr = 0
    elem_offset_expr = 0

    for sub in subscripts:
        idx_off = index_list_scalar_access(b, sub.indices, start_offset, loop_top_level)
        elem_offset_expr = plus_if_not0(idx_off, elem_offset_expr)
        elem_offset_expr = plus_if_not0(elem_offset_expr, sub.offset)
        (byte_offset_expr, elem_offset_expr) = subscript_offset(b, sub, byte_offset_expr,
                                                                elem_offset_expr, False, abi)

    return (byte_offset_expr, elem_offset_expr)
00638
def index_list_vector_access(b, indices, start_offset, loop_top_level, access_level, block_size,
                             index_offset_map, is_first_get, seq_map):
    """Build the element-offset expression for a blocked (vector) access.

    Returns (offset_expr, size_coef) where size_coef is the product of the
    increments of all inductive indices involved.  Blocked levels (those in
    seq_map) are addressed via their block counter times block_size; the top
    level is additionally shifted by start_offset.
    """
    s = b.scope
    offset_expr = 0
    size_coef = 1

    for i in indices:
        if i.code == ctrump.LOOP_INDEX_INDUCTIVE:
            if is_first_get and i.iv_level == access_level:
                # First GET before the loop body: block counters still hold
                # their initial value, so only start_offset contributes.
                incr = i.incr
                iv_level = i.iv_level
                size_coef = mul_if_not1(size_coef, incr)

                if iv_level in seq_map:
                    block_var_name = 'block_%s'%i.var.name
                    index_offset = 0
                    if iv_level in index_offset_map:
                        index_offset = index_offset_map[iv_level]

                    # NOTE(review): both arms below add only start_offset (or
                    # nothing); block_var_name/index_offset are computed but
                    # never used on this path — confirm this is intentional.
                    if iv_level == access_level:
                        if iv_level == loop_top_level:
                            offset_expr = plus_if_not0(offset_expr,
                                                       start_offset)
                        else:
                            pass
                    else:
                        if iv_level == loop_top_level:
                            offset_expr = plus_if_not0(offset_expr,
                                                       start_offset)
                        else:
                            pass

            else:
                incr = i.incr
                iv_level = i.iv_level
                size_coef = mul_if_not1(size_coef, incr)

                if iv_level in seq_map:
                    block_var_name = 'block_%s'%i.var.name
                    index_offset = 0
                    if iv_level in index_offset_map:
                        index_offset = index_offset_map[iv_level]

                    if iv_level == access_level:
                        # The blocked index itself: block counter * block size.
                        if iv_level == loop_top_level:
                            offset_expr = plus_if_not0(offset_expr,
                                                       plus_if_not0(s[block_var_name],index_offset)*block_size + start_offset)
                        else:
                            offset_expr = plus_if_not0(offset_expr,
                                                       plus_if_not0(s[block_var_name],index_offset)*block_size)
                    else:
                        # Other blocked levels also add the in-block index.
                        if iv_level == loop_top_level:
                            offset_expr = plus_if_not0(offset_expr,
                                                       s[i.var.name] + plus_if_not0(s[block_var_name],index_offset)*block_size + start_offset)
                        else:
                            offset_expr = plus_if_not0(offset_expr,
                                                       s[i.var.name] + plus_if_not0(s[block_var_name],index_offset)*block_size)

                else:
                    # Unblocked level: plain index scaled by its increment.
                    if iv_level == loop_top_level:
                        offset_expr = plus_if_not0(offset_expr,
                                                   mul_if_not1(s[i.var.name], incr) + start_offset)
                    else:
                        offset_expr = plus_if_not0(offset_expr,
                                                   mul_if_not1(s[i.var.name], incr))

        elif i.code == ctrump.LOOP_INDEX_INVARIANT:
            offset_expr = plus_if_not0(offset_expr, s[i.invariant.name])

        elif i.code == ctrump.LOOP_INDEX_POINTER_INC:
            # Pointer increments contribute no explicit offset here.
            pass

    return (offset_expr, size_coef)
00713
00714
def subscript_list_vector_access(b, subscripts, start_offset, loop_top_level,
                                 access_level, range_min, range_max,
                                 block_size, index_offset_map, is_first_get,
                                 abi, seq_map):
    """Byte/element offsets plus increment product for a blocked access."""
    byte_offset_expr = 0
    elem_offset_expr = 0
    size_coef = 1

    for sub in subscripts:
        (idx_off, idx_coef) = index_list_vector_access(b, sub.indices, start_offset, loop_top_level,
                                                       access_level, block_size, index_offset_map,
                                                       is_first_get, seq_map)
        size_coef = mul_if_not1(size_coef, idx_coef)
        elem_offset_expr = plus_if_not0(idx_off, elem_offset_expr)
        (byte_offset_expr, elem_offset_expr) = subscript_offset(b, sub, byte_offset_expr,
                                                                elem_offset_expr, True, abi)

    return (byte_offset_expr, elem_offset_expr, size_coef)
00733
def byte_offset_ptr_ref(b, ret_type, var, offset):
    """Build *(ret_type)((unsigned char *)var + offset)."""
    byte_ptr = b.cast(b.pointer_to(b.uchar), var)
    return b.ptr_ref(b.cast(ret_type, byte_ptr + offset))
00737
def byte_offset_ptr_add(b, ret_type, var, offset):
    """Build (ret_type)((unsigned char *)var + offset) — no dereference."""
    byte_ptr = b.cast(b.pointer_to(b.uchar), var)
    return b.cast(ret_type, byte_ptr + offset)
00741
00742
def emit_memory_access(b, memop,
                       align_size, mfc_ops, emit_env,
                       start_offset, top_loop_nest_level, mem_access_emitted):
    """Emit DMA transfer statements and the local access expression for memop.

    Sequential ops are queued in vector_load_ops/vector_store_ops for the
    blocked-transfer path; non-sequential (scalar) ops get per-iteration
    unaligned GET/PUT statements plus a blocking tag wait.  Results are
    stored on memop itself (scalar_access_expr, scalar_dma_*_stmts, ...).
    Each op is emitted at most once (tracked in mem_access_emitted), and
    chained ops are emitted parents-first so the parent's access expression
    can serve as this op's base address.

    BUG FIX: subscript_list_scalar_access was called with start_offset and
    top_loop_nest_level swapped relative to its signature
    (b, subscripts, start_offset, loop_top_level, abi).
    """
    scalar_dma_get_stmts = []
    scalar_dma_put_stmts = []
    scalar_buf_addr_set_stmts = []
    vector_load_ops = []
    vector_store_ops = []
    double_buffering = emit_env.double_buffering
    ea_type = emit_env.ea_type
    abi = emit_env.abi
    ignore_unalign = emit_env.ignore_unalign

    s = b.scope

    if memop in mem_access_emitted:
        return

    subscripts = memop.subscripts

    if memop.chain:
        # Emit the parent access first; its result is our base address.
        emit_memory_access(b, memop.chain, align_size, mfc_ops, emit_env,
                           start_offset, top_loop_nest_level, mem_access_emitted)
        ea_base_expr = memop.chain.scalar_access_expr
    else:
        ea_base_expr = s['ea_%s'%str(memop.array.var.name)]

    sub = subscripts[memop.load_data_sub_offset()]

    load_type = convert_ptr_to_ea(sub.load_type, ea_type)
    load_size = b.calc_type_size(abi, load_type)
    aligned_load_size = align_ceil(load_size, align_size)

    if memop.is_sequential:
        # Sequential accesses are DMA'd in blocks elsewhere; just record the
        # op and build its in-buffer reference expression.
        if memop.load_store == memop.LOAD:
            vector_load_ops.append((load_type, memop, ea_base_expr, subscripts))
        elif memop.load_store == memop.STORE:
            vector_store_ops.append((load_type, memop, ea_base_expr, subscripts))

        ref_expr = build_array_op_ref_expr(b, memop, double_buffering, ignore_unalign, memop.offset_list)
        memop.scalar_access_expr = ref_expr
    else:
        # BUG FIX: arguments 3 and 4 were passed swapped (the loop nest
        # level as the start offset and vice versa).
        (byte_offset,elem_offset) = subscript_list_scalar_access(b, subscripts,
                                                                 start_offset, top_loop_nest_level, abi)

        var = memop.buffer_var.buffer
        mod_var = memop.buffer_var.mod_var
        len_var = memop.buffer_var.len_var

        (range_min, range_max) = memop.load_data_range()
        offset = plus_if_not0(elem_offset, range_min)

        size_expr = b.literal(load_size)

        # Effective address = base + element-offset * element-size + bytes.
        ea_expr = plus_if_not0(b.cast(b.puchar,ea_base_expr),
                               plus_if_not0(mul_if_not1(size_expr,offset), byte_offset))

        if double_buffering:
            scalar_dma_get_stmts.append([
                emit_unaligned_mfc_scalar_mod(b, mod_var[0], len_var[0], align_size,
                                              mfc_ops.get, var[0], ea_type,
                                              ea_expr,
                                              aligned_load_size, 0)
            ])
            scalar_buf_addr_set_stmts.append([
                b.assign(memop.buffer_var.loop_pointer, var[0] + mod_var[0])
            ])
        else:
            scalar_dma_get_stmts.append([
                emit_unaligned_mfc_scalar_mod(b, mod_var, len_var, align_size,
                                              mfc_ops.get, var, ea_type,
                                              ea_expr,
                                              aligned_load_size, 0)
            ])
            scalar_buf_addr_set_stmts.append([
                b.assign(memop.buffer_var.loop_pointer, var + mod_var)
            ])

        if memop.load_store == memop.LOAD:
            # `stmt_list` was named `s`, shadowing the builder scope above.
            stmt_list = scalar_dma_get_stmts
            op = mfc_ops.get
            dma_size = aligned_load_size

            if double_buffering:
                local_buffer = var[0]
                stmt_list.append([
                    emit_unaligned_mfc_scalar_dma(b, mod_var[0], len_var[0], align_size,
                                                  op, local_buffer, ea_type,
                                                  ea_expr,
                                                  dma_size, 0)
                ])
                access_expr = var[0] + mod_var[0]
            else:
                local_buffer = var
                stmt_list.append([
                    emit_unaligned_mfc_scalar_dma(b, mod_var, len_var, align_size,
                                                  op, local_buffer, ea_type,
                                                  ea_expr,
                                                  dma_size, 0)
                ])
                access_expr = var + mod_var

        else:
            stmt_list = scalar_dma_put_stmts
            op = mfc_ops.put_unaligned
            dma_size = load_size

            local_buffer = memop.buffer_var.loop_pointer

            stmt_list.append([
                op(local_buffer, ea_expr, load_size, 0, 0, 0)
            ])

            if double_buffering:
                access_expr = var[0] + mod_var[0]
            else:
                access_expr = var + mod_var

        # Block on tag 0 so the value is in local store before it is used.
        stmt_list.append([
            mfc_ops.write_tag_mask(1),
            mfc_ops.read_tag_status_all()])

        memop.scalar_access_expr = b.ptr_ref(b.cast(b.pointer_to(load_type), access_expr))

    memop.scalar_dma_get_stmts = scalar_dma_get_stmts
    memop.scalar_dma_put_stmts = scalar_dma_put_stmts
    memop.scalar_buf_addr_set_stmts = scalar_buf_addr_set_stmts
    memop.vector_load_ops = vector_load_ops
    memop.vector_store_ops = vector_store_ops

    mem_access_emitted[memop] = memop
00876
00877
00878
def emit_buffer_declaration(b, memsize,
                            buffer_name, buffer_size, memop_node, invariant_memops,
                            emit_env):
    """Emit the global DMA buffer plus all per-op local declarations.

    Returns (global_decls, local_decls): the static 128-byte-aligned byte
    array that backs every transfer, and the buffer/pointer declarations for
    the invariant ops and the whole memop tree.

    Raises Exception when not even one loop iteration fits in the buffer.
    (Cleanup: seven locals copied out of memsize but never used were removed.)
    """
    global_decls = []
    local_decls = []

    # One statically allocated, 128-byte-aligned byte array backs every DMA
    # transfer of this loop.
    all_buffer_type = b.array_of(b.uchar, buffer_size)
    buffer_decl = b.decl(all_buffer_type, buffer_name, stor_class=STOR_CLASS_STATIC, align=128)

    global_decls.append(buffer_decl)

    if memsize.block_elem_count <= 0:
        raise Exception("too large data size")

    decl_memop_buffer_ops(local_decls, b, invariant_memops, memsize, '', memsize.block_elem_count, emit_env)
    decl_memop_buffer(local_decls, b, memop_node, memsize, memsize.block_elem_count, emit_env)

    return (global_decls, local_decls)
00906
def loop_blocking(builder, loop_node, all_loop_count, loop_stmt, block_size, ops, emit_env, double_buffer_dma_get):
    """Split the target loop into an outer block loop and an inner loop of at
    most block_size iterations.

    Returns the tuple (block_iv_decls, loop_counter_name, bound_var,
    block_counter_var, before_block_stmts, block_for_init, block_for_cond,
    block_for_iter, block_head, block_foot, inner_for_init, inner_cond,
    iter, body) that the caller assembles into the blocked loop nest.
    """
    double_buffering = emit_env.double_buffering
    b = builder
    s = b.scope

    loop_symbol = get_loop_symbol(loop_node)
    exit_info = loop_node.exit_info

    s = builder.scope

    init = loop_stmt.init
    cond = loop_stmt.cond
    iter = loop_stmt.iter
    body = loop_stmt.body

    before_block_stmts = []

    if double_buffering:
        # Variable selecting which half of the double buffer is active.
        bufside = s['bufside_%s'%loop_symbol]

    ptr_inc_var = PtrIncVarEnv()
    iv_list = []

    for i in ops:
        iv_list += i.extract_iv(ptr_inc_var)
    emit_env.all_block_iv += iv_list

    block_iv = []
    # NOTE(review): entries appended here are 3-tuples while the loops below
    # unpack 4 elements — a non-empty incr_table would raise ValueError;
    # confirm incr_table is always empty on this path.
    for incr in ptr_inc_var.incr_table:
        block_iv.append((builder.int_, 'block_incptr_%d'%incr, None))

    iv_emitted = {}
    for iv in iv_list:
        if not iv.var in iv_emitted:
            block_iv.append((iv.var.type, 'block_%s'%str(iv.var.name), iv.var.name, iv.reach_at_loop_entry))
            iv_emitted[iv.var] = True

    block_iv_decls = []

    for (type,name,orig_name,loop_entry_definition) in block_iv:
        iv_var = s[orig_name]

        block_initial_val = iv_var/block_size

        entry_val = None

        if loop_entry_definition and loop_entry_definition.code == ctrump.PDG_VALUE_EXPR:
            entry_val = ctrump.fold_const(loop_entry_definition.val)
            if None != entry_val:
                block_initial_val = iv_var/block_size
            else:
                block_initial_val = builder.div(iv_var, block_size)

        # NOTE(review): id(0) == id(entry_val) relies on CPython's small-int
        # cache and is effectively `entry_val is 0` — confirm entry_val can
        # never be 0.0 or a large/boxed zero here.
        if id(0) == id(entry_val):
            # Counter provably starts at 0: initialize the block counter to 0.
            block_iv_decls.append(builder.decl(type, name, init=0))
        else:
            # Otherwise remember the initial block index in a *_init variable.
            init_name = '%s_init'%name
            block_iv_decls.append(builder.decl(type, init_name, init=block_initial_val))
            block_iv_decls.append(builder.decl(type, name, init=s[init_name]))

    empty_expr = builder.empty_expr()

    block_for_init = empty_expr

    (loop_counter, iv_loop_entry, loop_counter_incr) = EI.get_loop_counter(exit_info)
    loop_counter_name = loop_counter.name

    num_block_count = gen_ceil_div(all_loop_count,block_size)
    block_counter_name = 'block_%s'%str(loop_counter_name)
    block_loop_num_name = 'block_count_%s'%str(loop_counter_name)

    if not loop_counter in iv_emitted:
        block_iv_decls.append(builder.decl(loop_counter.type,
                                           block_counter_name,
                                           init=0))

    iv_var = s[loop_counter.name]
    iv_block_var = s['block_%s'%loop_counter.name]

    block_initial_val = iv_var/block_size

    first_iv_init = None
    inner_for_init = empty_expr

    if iv_loop_entry.code == ctrump.PDG_VALUE_EXPR:
        entry_val = ctrump.fold_const(iv_loop_entry.val)
        if (None != entry_val) and entry_val == 0:
            # Counter provably starts at 0: trivial (re)initialization.
            first_iv_init = []
            inner_for_init = b.assign(iv_var, 0)
            block_for_init = b.assign(iv_block_var, 0)

    if first_iv_init == None:
        # Unknown entry value: remember the start value and reset the counter
        # to 0 for every block after the first.
        block_iv_decls.append(b.decl(iv_var.type, '%s_start'%iv_var.name, init=iv_var))

        start_var = s['%s_start'%iv_var.name]

        first_iv_init = [
            b.if_ (iv_block_var != s['block_%s_init'%loop_counter.name],
                   b.assign(iv_var, 0)),
            b.assign(start_var, iv_var)
        ]

        for i in ops:
            i.loop_counter_offset = start_var
    else:
        for i in ops:
            i.loop_counter_offset = None

    block_iv_decls.append(builder.decl(loop_counter.type,
                                       block_loop_num_name,
                                       init=num_block_count))

    block_for_cond = builder.binary(exit_info.cmp_op,
                                    s[block_counter_name],
                                    s[block_loop_num_name])
    block_for_iter = empty_expr

    bound_var = EI.build_bound_var(builder, block_iv_decls,
                                   exit_info, 'loop_num', block_size)

    if double_buffering:
        # Pre-compute the next block's trip count and kick off its DMA GET
        # while the current block computes.
        next_bound_var_name = '%s_next'%bound_var.name
        block_iv_decls.append(b.decl(b.int_, next_bound_var_name, init=0))
        next_bound_var = s[next_bound_var_name]
        calc_next_len = b.if_(s[block_counter_name] == s[block_loop_num_name]-2,
                              b.comp(b.assign(next_bound_var, (all_loop_count-1)%block_size+1)))
        not_last_loop = b.comp(b.assign(next_bound_var, block_size),
                               calc_next_len,
                               double_buffer_dma_get)
    else:
        not_last_loop = None

    # Block prologue: full trip count normally, remainder on the last block.
    block_head = [
        b.assign(bound_var, block_size),
        b.if_ (s[block_counter_name] == s[block_loop_num_name]-1,
               b.comp(b.assign(bound_var, (all_loop_count-1)%block_size+1)),
               else_body = not_last_loop),
        first_iv_init,
        b.newline(),
    ]

    inner_cond = builder.binary(exit_info.cmp_op,
                                builder.varref(exit_info.inductive),
                                mul_if_not1(bound_var, loop_counter_incr))

    block_iv_decls.append(b.newline())

    # Block epilogue: advance every block counter.
    block_foot = []
    for (type,name,orig_name,pdg) in block_iv:
        block_foot.append(b.postinc(s[name]))

    if double_buffering:
        # Flip the double-buffer side at the end of every block.
        block_foot.append(b.assign(bufside , bufside^1))

    return (block_iv_decls, loop_counter_name, bound_var, s[block_counter_name],
            before_block_stmts,
            block_for_init, block_for_cond, block_for_iter, block_head, block_foot,
            inner_for_init, inner_cond ,iter, body)
01065
def offload_loop(b, loop_node, init, cond, iter, body, scalar_get_stmts, scalar_put_stmts):
    """Build the offloaded inner loop: a for-statement whose body runs the
    scalar DMA gets, then the original loop body, then the scalar DMA puts."""
    wrapped_body = b.comp(scalar_get_stmts, body, scalar_put_stmts)
    return b.for_(init, cond, iter, wrapped_body)
01068
def build_array_ref(b, array_var, sub_sub, offset_list):
    """Build the local-store reference expression for one array access.

    Walks the subscript chain *sub_sub* together with its per-level
    (offset, range_min, range_max) entries from *offset_list* and applies
    indexing / member selection to *array_var* accordingly.  Coefficient
    subscripts (array-size / constant / scale) contribute no syntax and
    are skipped.
    """
    expr = array_var

    for (sub, (off, range_min, range_max)) in zip(sub_sub, offset_list):
        # An int offset is applied numerically; anything else is treated
        # as a member tag used by RECORD_MEMBER_TERMINAL.
        if isinstance(off, int):
            offset_int, offset_tag = off, None
        else:
            offset_int, offset_tag = 0, off

        code = sub.code
        if (code == ctrump.LOOP_SUBSCRIPT_COEF_TERMINAL or
            code == ctrump.LOOP_SUBSCRIPT_LOAD_RECORD):
            # Index by the (optional) index variable plus the offset.
            index_expr = offset_int
            if len(sub.indices) != 0:
                index_expr = plus_if_not0(b.varref(sub.indices[0].var), index_expr)
            expr = expr[index_expr]
        elif code in (ctrump.LOOP_SUBSCRIPT_COEF_ARRAYSIZE,
                      ctrump.LOOP_SUBSCRIPT_COEF_CONSTANT,
                      ctrump.LOOP_SUBSCRIPT_COEF_SCALE):
            pass  # pure coefficients: nothing to emit at this level
        elif code == ctrump.LOOP_SUBSCRIPT_RECORD_MEMBER:
            expr = expr.member(sub.member_name)
        elif code == ctrump.LOOP_SUBSCRIPT_RECORD_MEMBER_TERMINAL:
            if offset_tag:
                expr = expr.member(offset_tag)
            else:
                expr = expr.member(sub.member_name)
        else:
            raise NotImplementedError('unknown subscript code "%s"'%ctrump.loop_subscript_code_string_table[sub.code])

    return expr
01105
def build_array_op_ref_expr(b, array_op, double_buffering, ignore_unalign, offset_list):
    """Build the local-store reference expression for *array_op*.

    For sequential accesses, the subscripts up to the sequential index are
    absorbed by the DMA transfer and dropped; only inductive sequential
    indices are supported.
    """
    if not array_op.is_sequential:
        return build_array_ref(b, array_op.buffer_var.loop_pointer,
                               array_op.subscripts, offset_list)

    if array_op.sequential_subscript.indices[0].code != ctrump.LOOP_INDEX_INDUCTIVE:
        raise Exception('sequential access with "%s"'%
                        ctrump.loop_index_code_string_table[array_op.sequential_subscript.indices[0].code])

    # Drop the leading subscripts that the sequential DMA already covers.
    skip = array_op.sequential_subscript_index
    return build_array_ref(b, array_op.buffer_var.loop_pointer,
                           array_op.subscripts[skip:], offset_list[skip:])
01123
01124
def replace_array_ref_expr(b, array_op, double_buffering, ignore_unalign):
    """Rewrite every recorded array-reference expression of *array_op*
    so that it points into the SPE local-store buffer.

    Entries whose expression is falsy are skipped (nothing to replace).
    """
    # (removed an unused local: `s = b.scope` was assigned but never read)
    for (offset, expr) in array_op.update_expr_list:
        new_expr = build_array_op_ref_expr(b, array_op, double_buffering,
                                           ignore_unalign, offset)
        if expr:
            expr.replace(b.build_expr(new_expr))
01132
01133
def gather_buffer_name_1(ops, ptrs, emitted, ea_type):
    """Append one (original_type, ea_type, var) tuple to *ptrs* for each
    distinct array variable referenced by *ops*.

    *emitted* maps variable names already collected, so shared buffers are
    recorded only once.
    """
    for op in ops:
        var = op.array.var
        if not var.name in emitted:
            # (removed an unused local: the result of
            # ctrump.type_apply_unary_pointer_conversion() was never read)
            ptrs.append((var.type, ea_type, var))
            emitted[var.name] = True
01142
def gather_buffer_name_0(node, ptrs, emitted, ea_type):
    """Walk the memory-operation tree rooted at *node* in preorder,
    collecting the buffer variables of every node's ops."""
    pending = [node]
    while pending:
        current = pending.pop()
        gather_buffer_name_1(current.ops, ptrs, emitted, ea_type)
        # Push children reversed so they are visited in original order.
        pending.extend(reversed(current.children))
01147
def gather_buffer_name(memory_operation, ptrs, emitted, ea_type):
    """Collect buffer-parameter entries: the loop invariants first, then
    every node of the memory-operation tree."""
    gather_buffer_name_1(memory_operation.invariants, ptrs, emitted, ea_type)
    gather_buffer_name_0(memory_operation.tree, ptrs, emitted, ea_type)
01151
01152
def emit_loop_node(b, mem_size, memop_node, emit_env,
                   mfc_ops, all_loop_count, start_offset, top_loop_nest_level, mem_access_emitted, seq_map):
    """Emit the SPE-side code for one node of the memory-operation tree.

    Replaces the node's original `for` statement in place with either a
    blocked, DMA-driven loop (when the node has vectorizable load/store
    operations) or a plain offloaded loop, recurses into the node's
    children, and finally rewrites every recorded array reference to
    point at the local-store buffers.

    Returns the (replaced) loop statement.  Raises Exception when the
    loop is not a `for` statement.
    """
    ea_type = emit_env.ea_type
    double_buffering = emit_env.double_buffering
    ignore_unalign = emit_env.ignore_unalign
    align_size = emit_env.align_size
    abi = emit_env.abi
    loop_node = memop_node.loop_node
    exit_info = loop_node.exit_info
    loop_cfg_info = loop_node.cfg_info
    loop_stmt = loop_cfg_info.loop_stmt

    # The top-level call passes the bound; recursive calls pass None and
    # the bound is rebuilt from the loop's exit condition.
    if not all_loop_count:
        all_loop_count = EI.build_bound_expr(b, exit_info)

    ops = memop_node.ops

    loop_symbol = get_loop_symbol(loop_node)

    scalar_get_stmts = []
    scalar_put_stmts = []
    scalar_buf_addr_set_stmts = []
    vector_load_ops = []
    vector_store_ops = []

    if not loop_stmt.code == ctrump.STMT_FOR:
        # bugfix: was `raise ('not "for" loop')` which raises a plain string
        # and therefore dies with TypeError instead of this message.
        raise Exception('not "for" loop')

    for i in ops:
        emit_memory_access(b, i,
                           align_size, mfc_ops, emit_env,
                           start_offset, top_loop_nest_level, mem_access_emitted)

    # Collect the scalar DMA statements and the vector ops produced above.
    for i in ops:
        scalar_get_stmts += i.scalar_dma_get_stmts
        scalar_put_stmts += i.scalar_dma_put_stmts
        scalar_buf_addr_set_stmts += i.scalar_buf_addr_set_stmts

    for i in ops:
        vector_load_ops += i.vector_load_ops
        vector_store_ops += i.vector_store_ops

    (loop_counter, iv_loop_entry, loop_counter_incr) = EI.get_loop_counter(exit_info)

    s = b.scope
    iv_var = s[loop_counter.name]

    # Induction-variable initialization: zero when the loop provably starts
    # at zero, otherwise rebase against this SPE's slice of the iterations.
    first_iv_init = None
    if iv_loop_entry.code == ctrump.PDG_VALUE_EXPR:
        entry_val = ctrump.fold_const(iv_loop_entry.val)
        if (None != entry_val) and entry_val == 0:
            first_iv_init = [b.assign(iv_var, 0)]

    if first_iv_init == None:
        first_iv_init = [
            b.if_(iv_var < s.spe_param.member('loop_count')*s.spe_id,
                  b.assign(iv_var, 0),
                  b.assign(iv_var, iv_var - s.spe_param.member('loop_count')*s.spe_id))
            ]

    if (len(vector_load_ops) != 0) or (len(vector_store_ops) != 0):
        # --- Blocked loop with vector DMA transfers --------------------
        block_size = mem_size.block_elem_count

        bufside_decls = []

        if double_buffering:
            bufside_decls.append(b.decl(b.int_, 'bufside_%s'%loop_symbol, init=0))
            bufside = b.scope['bufside_%s'%loop_symbol]

        double_buffer_dma_get = []

        (decls, loop_symbol, loop_bound_var, block_ind_var,
         before_block_stmts,
         block_init, block_cond, block_iter, block_body, block_foot,
         inner_init, inner_cond, inner_iter, inner_body) = loop_blocking(b, loop_node, all_loop_count,
                                                                         loop_stmt, block_size, ops, emit_env,
                                                                         double_buffer_dma_get)

        inner_loop = offload_loop(b, loop_node, inner_init, inner_cond, inner_iter, inner_body,
                                  [scalar_get_stmts, scalar_buf_addr_set_stmts], scalar_put_stmts)

        decls.append(bufside_decls)

        dma_get_stmts = []
        mfc_mod_stmt_list = []
        dma_put_stmts = []
        double_buffer_first_get = []

        adjust_buffer_stmt = []

        # Per-buffer "already emitted" maps so each LS buffer gets its DMA
        # setup only once even when several memops share it.
        get_emitted = {}
        put_emitted = {}
        adjust_emitted = {}

        index_offset_map = {}

        if double_buffering:
            # Prefetch one block ahead on this nest level.
            index_offset_map[memop_node.loop_node.nest_level] = 1

        def vector_dma_op(op):
            # Emit the DMA get/put statements for one vectorized memory op.
            (load_type, memop, ea_expr, offset_subscripts) = op
            (range_min, range_max) = memop.load_data_range()
            d = range_max - range_min

            ae = plus_if_not0(ea_expr, range_min*memop.load_data_size)

            # Offsets for the current block, the very first block, and
            # (with double buffering) the next block to prefetch.
            (block_byte_offset, block_elem_offset, size_coef) = subscript_list_vector_access(b,
                                                                                             offset_subscripts,
                                                                                             start_offset,
                                                                                             top_loop_nest_level,
                                                                                             memop.loop_node.nest_level,
                                                                                             range_min,
                                                                                             range_max,
                                                                                             mem_size.block_elem_count,
                                                                                             {},
                                                                                             False,
                                                                                             abi,
                                                                                             seq_map)

            (first_block_byte_offset, first_block_elem_offset, size_coef) = subscript_list_vector_access(b,
                                                                                                         offset_subscripts,
                                                                                                         start_offset,
                                                                                                         top_loop_nest_level,
                                                                                                         memop.loop_node.nest_level,
                                                                                                         range_min,
                                                                                                         range_max,
                                                                                                         mem_size.block_elem_count,
                                                                                                         {},
                                                                                                         True,
                                                                                                         abi,
                                                                                                         seq_map)

            (block_byte_offset_get, block_elem_offset_get, size_coef) = subscript_list_vector_access(b,
                                                                                                     offset_subscripts,
                                                                                                     start_offset,
                                                                                                     top_loop_nest_level,
                                                                                                     memop.loop_node.nest_level,
                                                                                                     range_min,
                                                                                                     range_max,
                                                                                                     mem_size.block_elem_count,
                                                                                                     index_offset_map,
                                                                                                     False,
                                                                                                     abi,
                                                                                                     seq_map)

            loop_pointer = memop.buffer_var.loop_pointer
            buffer = memop.buffer_var.buffer

            is_partial_store = memop.is_partial_store()

            memop_size = b.literal(memop.load_data_size)
            block_off_put = mul_if_not1(plus_if_not0(memop_size*(block_elem_offset), block_byte_offset), size_coef)
            block_off_first = mul_if_not1(plus_if_not0(memop_size*first_block_elem_offset, first_block_byte_offset), size_coef)

            put_off = 0

            if memop.loop_counter_offset:
                put_off = memop_size*memop.loop_counter_offset

            if double_buffering:
                block_off_get = mul_if_not1(plus_if_not0(memop_size*(block_elem_offset_get), block_byte_offset_get), size_coef)
                next_loop_bound_var = s['%s_next'%loop_bound_var.name]
                next_size = plus_if_not0(next_loop_bound_var, d) * memop.load_data_size
                next_size = mul_if_not1(next_size, size_coef)
            else:
                block_off_get = block_off_put

            current_size = plus_if_not0(loop_bound_var, d) * memop.load_data_size
            current_size = mul_if_not1(current_size, size_coef)

            # Select MFC function, target statement list, buffer side and
            # transfer sizes depending on load/store and double buffering.
            if memop.load_store == memop.LOAD:
                mfc_func = mfc_ops.get

                if double_buffering:
                    mfc_stmt_list = double_buffer_dma_get
                    bufside_expr_get = bufside^1
                    bufside_expr_dma = bufside^1
                    block_off_dma = block_off_get
                    get_size = next_size
                    dma_size = next_size
                    dma_off = 0
                else:
                    mfc_stmt_list = dma_get_stmts
                    block_off_dma = block_off_get
                    get_size = current_size
                    dma_size = current_size
                    dma_off = 0

                mfc_emitted = get_emitted
                put = False
            else:
                mfc_func = mfc_ops.put_unaligned
                mfc_stmt_list = dma_put_stmts
                mfc_emitted = put_emitted
                if double_buffering:
                    bufside_expr_get = bufside^1
                    bufside_expr_dma = bufside
                    block_off_dma = block_off_put
                    get_size = next_size
                    dma_size = current_size
                    dma_off = put_off
                else:
                    block_off_dma = block_off_get
                    get_size = current_size
                    dma_size = current_size
                    dma_off = put_off

                put = True
                block_off = block_off_put

            if double_buffering:
                tag_expr_dma = bufside_expr_dma+loop_node.nest_level*2
                tag_expr_get = bufside_expr_get+loop_node.nest_level*2
            else:
                tag_expr = 0

            if not buffer in mfc_emitted:
                mfc_emitted[buffer] = True
                pin0 = plus_if_not0
                min0 = minus_if_not0

                if double_buffering:
                    if ignore_unalign:
                        raise Exception('fixme:aligned-doublebuffering')
                    # NOTE(review): 'tag_expr' (and, for loads, 'block_off')
                    # is undefined on this path and would raise NameError —
                    # verify whether this branch is ever reached.
                    mfc_stmt_list.append(mfc_func(buffer[bufside_expr_dma],
                                                  ae + block_off, get_size,
                                                  tag_expr, 0, 0))
                else:
                    # NOTE(review): this branch uses double-buffering names
                    # (bufside_expr_get, tag_expr_get/_dma) that are only
                    # assigned when double_buffering is true — the condition
                    # above may be inverted; confirm against callers.
                    mod = memop.buffer_var.mod_var
                    len_var = memop.buffer_var.len_var

                    double_buffer_first_get.append(emit_unaligned_mfc_mod(b, mod[0], len_var[0],
                                                                          align_size, mfc_ops,
                                                                          buffer[0],
                                                                          ea_type,
                                                                          pin0(ae,block_off_first),
                                                                          current_size,
                                                                          loop_node.nest_level*2))

                    if not put:
                        double_buffer_first_get.append(emit_unaligned_mfc_dma(b, mod[0], len_var[0],
                                                                              align_size, mfc_func,
                                                                              buffer[0],
                                                                              ea_type,
                                                                              pin0(ae,block_off_first),
                                                                              current_size,
                                                                              loop_node.nest_level*2))

                    elif is_partial_store:
                        # Partial store: fetch the original data first so the
                        # untouched bytes survive the later put.
                        double_buffer_first_get.append(emit_unaligned_mfc_dma(b, mod[0], len_var[0],
                                                                              align_size, mfc_ops.getf,
                                                                              buffer[0],
                                                                              ea_type,
                                                                              pin0(ae,block_off_first),
                                                                              current_size,
                                                                              loop_node.nest_level*2))

                    double_buffer_dma_get.append(emit_unaligned_mfc_mod(b, mod[bufside_expr_get], len_var[bufside_expr_get],
                                                                        align_size, mfc_ops,
                                                                        buffer[bufside_expr_get],
                                                                        ea_type,
                                                                        pin0(ae,block_off_get), get_size,
                                                                        tag_expr_get))

                    if is_partial_store:
                        double_buffer_dma_get.append(emit_unaligned_mfc_dma(b, mod[bufside_expr_get], len_var[bufside_expr_get],
                                                                            align_size, mfc_ops.getf,
                                                                            buffer[bufside_expr_get],
                                                                            ea_type, ae + block_off_get, dma_size,
                                                                            tag_expr_get))

                    if put:
                        mfc_stmt_list.append(mfc_ops.put_unaligned(pin0(b.cast(b.puchar,loop_pointer), dma_off),
                                                                   pin0(pin0(ae, block_off_dma), dma_off),
                                                                   min0(dma_size,dma_off), tag_expr_dma, 0, 0))
                    else:
                        mfc_stmt_list.append(emit_unaligned_mfc_dma(b, mod[bufside_expr_dma], len_var[bufside_expr_dma],
                                                                    align_size, mfc_func,
                                                                    buffer[bufside_expr_dma],
                                                                    ea_type, ae + block_off_dma, dma_size,
                                                                    tag_expr_dma))

            else:
                # NOTE(review): 'pin0'/'min0' are bound only in the branch
                # above; this path would raise NameError when put is true —
                # verify whether a buffer is ever revisited here.
                if ignore_unalign:
                    mfc_stmt_list.append(mfc_func(buffer, ae + block_off_dma, get_size,
                                                  0, 0, 0))
                else:
                    mod = memop.buffer_var.mod_var
                    len_var = memop.buffer_var.len_var

                    if is_partial_store:
                        dma_get_stmts.append(emit_unaligned_mfc_mod(b, mod, len_var,
                                                                    align_size, mfc_func,
                                                                    buffer,
                                                                    ea_type, ae + block_off_dma, dma_size,
                                                                    tag_expr))
                        dma_get_stmts.append(emit_unaligned_mfc_dma(b, mod, len_var,
                                                                    align_size, mfc_ops.getf,
                                                                    buffer,
                                                                    ea_type, ae + block_off_dma, dma_size,
                                                                    tag_expr))
                    else:
                        mfc_mod_stmt_list.append(emit_unaligned_mfc_mod(b, mod, len_var,
                                                                        align_size, mfc_func,
                                                                        buffer,
                                                                        ea_type, ae + block_off_dma, dma_size,
                                                                        tag_expr))

                    if put:
                        mfc_stmt_list.append(mfc_ops.put_unaligned(pin0(b.cast(b.puchar,loop_pointer), dma_off),
                                                                   pin0(pin0(ae, block_off_dma), dma_off),
                                                                   min0(dma_size,dma_off), tag_expr, 0, 0))
                    else:
                        mfc_stmt_list.append(emit_unaligned_mfc_dma(b, mod, len_var,
                                                                    align_size, mfc_func,
                                                                    buffer,
                                                                    ea_type, ae + block_off_dma, dma_size,
                                                                    tag_expr))

            # Point the loop pointer at the (possibly mod-adjusted) buffer,
            # rebased by -range_min so negative-range accesses stay in bounds.
            if not buffer in adjust_emitted:
                pointer_type = b.pointer_to(load_type)
                adjust_emitted[buffer] = True
                (range_min, range_max) = memop.load_data_range()
                off_expr = -range_min
                if double_buffering:
                    if ignore_unalign:
                        adjust_buffer_stmt.append(b.assign(loop_pointer,
                                                           plus_if_not0(buffer[bufside],off_expr)))
                    else:
                        mod = memop.buffer_var.mod_var
                        buffer_addr = byte_offset_ptr_add(b, pointer_type, buffer[bufside], mod[bufside])
                        adjust_buffer_stmt.append(b.assign(loop_pointer,
                                                           plus_if_not0(buffer_addr, off_expr)))
                else:
                    if ignore_unalign:
                        adjust_buffer_stmt.append(b.assign(loop_pointer,
                                                           plus_if_not0(buffer,off_expr)))
                    else:
                        mod = memop.buffer_var.mod_var
                        buffer_addr = byte_offset_ptr_add(b, pointer_type, buffer, mod)
                        adjust_buffer_stmt.append(b.assign(loop_pointer,
                                                           plus_if_not0(buffer_addr, off_expr)))

        for i in vector_load_ops:
            vector_dma_op(i)
        for i in vector_store_ops:
            vector_dma_op(i)

        if double_buffering:
            tag_expr = b.binary(Binary.LSHIFT, 1, b.paren(plus_if_not0(bufside,loop_node.nest_level*2)))
        else:
            tag_expr = 1

        # Wait for outstanding DMAs before entering the inner loop.
        if (len(double_buffer_dma_get) + len(dma_get_stmts) + len(dma_put_stmts)) != 0:
            dma_get_stmts.append([mfc_ops.write_tag_mask(tag_expr),
                                  mfc_ops.read_tag_status_all()])

        block_loop = [double_buffer_first_get,
                      b.for_(block_init,
                             block_cond,
                             block_iter,
                             b.comp(block_body, mfc_mod_stmt_list, dma_get_stmts,
                                    adjust_buffer_stmt, inner_loop, dma_put_stmts, block_foot)),
                      mfc_ops.write_tag_mask(3),
                      mfc_ops.read_tag_status_all()]

        block_loop = [first_iv_init, before_block_stmts, decls, block_loop]

        loop_stmt.replace(b.build_stmt(block_loop))
    else:
        # --- No vector ops: plain offloaded loop with scalar DMAs ------
        cond = convert_cond(b, exit_info, all_loop_count)

        body = loop_stmt.body

        if all_loop_count:
            init = b.empty_expr()
        else:
            init = loop_stmt.init

        inner_loop = offload_loop(b, loop_node, init, cond, loop_stmt.iter, body,
                                  scalar_get_stmts, scalar_put_stmts)

        loop_stmt.replace(b.build_stmt([first_iv_init, inner_loop]))

        for i in ops:
            # NOTE(review): 'ptr_inc_var' is not defined anywhere in this
            # function — presumably a module-level name; verify, otherwise
            # this path raises NameError.
            emit_env.all_scalar_iv += i.extract_iv(ptr_inc_var)

    class IVWrapper:
        # Pairs an induction variable with the nest level it belongs to.
        def __init__(self, var):
            self.var = var
            self.iv_level = loop_node.nest_level
    emit_env.all_scalar_iv.append(IVWrapper(loop_counter))

    for i in memop_node.children:
        emit_loop_node(b, mem_size, i, emit_env, mfc_ops,
                       None, start_offset, top_loop_nest_level, mem_access_emitted, seq_map)

    for i in ops:
        replace_array_ref_expr(b, i, double_buffering, ignore_unalign)

    return loop_stmt
01569
def decl_var_list(b, vars, declared):
    """Emit one declaration per not-yet-declared, non-function variable.

    *declared* maps names that already have a declaration; it is updated
    in place so repeated calls never declare the same name twice.
    """
    decls = []
    for info in vars:
        var = info.var
        if var.type.code == ctrump.TYPE_FUNC:
            continue  # functions are never declared as locals
        if var.name in declared:
            continue
        decls.append(b.decl(var.type, var.name))
        declared[var.name] = True
    return decls
01580
def get_loop_blocks_0(bb, exit, emitted, blocks):
    """Preorder-collect into *blocks* every basic block reachable from *bb*
    without passing through *exit*; *emitted* marks visited blocks.

    bugfix: the recursion into successors used to sit OUTSIDE the
    visited-guard, so any cyclic CFG (every loop CFG has a back edge)
    recursed forever.  Recurse only on first visit.
    """
    if bb == exit or bb in emitted:
        return
    emitted[bb] = True
    blocks.append(bb)

    for succ in bb.succs:
        get_loop_blocks_0(succ, exit, emitted, blocks)
01588
01589
def get_loop_blocks(loop_cfg_info):
    """Return the loop's basic blocks: everything reachable from the
    condition block, with the break (exit) block excluded."""
    collected = []
    get_loop_blocks_0(loop_cfg_info.cond_bb, loop_cfg_info.break_bb, {}, collected)
    return collected
01597
01598 class OffloadSpeTranslator(LoopTranslatorBase):
01599 def __init__(self):
01600 super(OffloadSpeTranslator,self).__init__()
01601
01602 def get_name(self):
01603 return SELF_TRANSLATOR_NAME
01604
    def get_global_option(self):
        # Return the translator-wide option description table.
        # NOTE(review): only 'global_options_table' is defined at the top of
        # this file; verify that 'global_option' is bound elsewhere in the
        # module, otherwise this raises NameError when called.
        return global_option
01607
    def get_translate_option(self):
        # Return the per-translation option description table.
        # NOTE(review): only 'translate_options_table' is defined at the top
        # of this file; verify that 'translate_option' is bound elsewhere in
        # the module, otherwise this raises NameError when called.
        return translate_option
01610
01611 def translate(self, analyze_result, stmt, cfg, loop_info_node, global_options, translate_option, prog):
01612 translator_global_option = global_options.translators[SELF_TRANSLATOR_NAME]
01613
01614 program_obj_name = translate_option['name_of_spe_program']
01615 parameter_var_name = translate_option['parameter_var_name']
01616 buf_size = translate_option['buf_size']
01617 align_128 = translate_option['get_align_128']
01618 ignore_small_size = False
01619 ignore_unaligned_data = False
01620 double_buffering = translate_option['doublebuffering']
01621
01622 align_size = 16
01623 if align_128:
01624 align_size = 128
01625
01626 get_spe_num = 'ctrump_spe_num'
01627 get_spe_template = 'ctrump_runtime_spes[%s].spe_context'
01628 max_spe_num = translator_global_option['max_spe_num']
01629 address_mode = translator_global_option['address_mode']
01630
01631 spe_runtime_name = SPE_LIBSPE2
01632 spe_runtime_mod = spe_runtime_table[spe_runtime_name]
01633
01634 env_obj = stmt.env_obj
01635 b = Builder(env_obj)
01636 s = b.scope
01637
01638 spe_runtime = spe_runtime_mod.create(b)
01639
01640 li = loop_info_node.loop_node
01641 invariants_bmp = li.invariant_var
01642 invariants = bitmap_var_info(cfg.var_info, invariants_bmp)
01643
01644 if address_mode == 64:
01645 ea_type = b.ullong
01646 abi = ctrump.ppc64_abi
01647 else:
01648 ea_type = b.uint
01649 abi = ctrump.ppc32_abi
01650
01651 emit_env = EmitEnv(abi, ea_type, align_size, ignore_unaligned_data, double_buffering)
01652
01653 spe_param_in_member_list = [(ea_type, 'sync_buffer_ea'),
01654 (b.int_,'start'),
01655 (b.int_,'loop_count'),
01656 (b.int_,'all_loop_count'),
01657 (b.int_,'spe_id'),
01658 (b.int_,'is_last_spe')]
01659
01660 memory_operation = loop_tree_to_memop_tree(li)
01661 memory_operation.optimize()
01662 memop_tree = memory_operation.tree
01663
01664 load_store = []
01665
01666 invariants_and_incptrs = []
01667
01668 for i in invariants:
01669 var = i.var
01670 if var.name in emit_env.variables:
01671 continue
01672
01673 original_texpr = var.type
01674 pointer_texpr = ctrump.type_apply_unary_pointer_conversion(original_texpr)
01675 if pointer_texpr:
01676 texpr = ea_type
01677 else:
01678 texpr = original_texpr
01679 invariants_and_incptrs.append((original_texpr, texpr, var))
01680 emit_env.variables[var.name] = True
01681
01682 (loop_cond_ind, loop_iv_reach, loop_iv_incr) = EI.get_loop_counter(li.exit_info)
01683
01684 loop_init_stmt = b.b.stmt_expr(code=ctrump.STMT_EXPR, expr=stmt.init)
01685
01686 loop_iv_is_zero = False
01687
01688 if loop_iv_reach.code == ctrump.PDG_VALUE_EXPR:
01689 entry_val = ctrump.fold_const(loop_iv_reach.val)
01690 if (None != entry_val) and entry_val == 0:
01691 loop_iv_is_zero = True
01692
01693 if not loop_iv_is_zero:
01694 name = li.exit_info.inductive.name
01695 if not name in emit_env.variables:
01696 emit_env.variables[name] = True
01697 invariants_and_incptrs.append((b.int_, b.int_, li.exit_info.inductive))
01698
01699 gather_buffer_name(memory_operation, invariants_and_incptrs,
01700 emit_env.variables, ea_type)
01701
01702 for (orig_t, param_t, var) in invariants_and_incptrs:
01703 spe_param_in_member_list.append((param_t,var.name))
01704
01705 spe_param_out_member_list = []
01706
01707 for i in li.reductions:
01708 spe_param_out_member_list.append((i.var.type, i.var.name))
01709
01710 have_spe_output = (len(spe_param_out_member_list) != 0)
01711
01712 spe_param_decl = []
01713
01714 if have_spe_output:
01715 spe_param_in_typespec = b.struct('spe_param_in', spe_param_in_member_list, attr_align=align_size)
01716 spe_param_out_typespec = b.struct('spe_param_out', spe_param_out_member_list, attr_align=align_size)
01717 spe_param_type_spec = b.union('spe_param', [(spe_param_in_typespec,'in'),
01718 (spe_param_out_typespec,'out')], attr_align=align_size)
01719
01720 param_struct_decl = [
01721 b.struct_decl(spe_param_in_typespec),
01722 b.struct_decl(spe_param_out_typespec),
01723 b.struct_decl(spe_param_type_spec)]
01724 else:
01725 spe_param_type_spec = b.struct('spe_param', spe_param_in_member_list, attr_align=align_size)
01726 param_struct_decl = b.struct_decl(spe_param_type_spec)
01727
01728 param_typedef_decl = b.decl(spe_param_type_spec, 'spe_param_t', stor_class=ctrump.STOR_CLASS_TYPEDEF)
01729 param_type_size = b.calc_type_size(abi, spe_param_type_spec)*max_spe_num + align_size
01730 spe_param_array_type_spec = b.array_of(b.uchar, param_type_size)
01731 spe_param_ptr_type_spec = b.pointer_to(s.spe_param_t)
01732 spe_param_array_decl = b.decl(spe_param_array_type_spec, 'spe_param_buffer')
01733 param_mod_ea = b.decl(ea_type, 'param_ea_mod', init=b.band(b.cast(ea_type, s.spe_param_buffer) , (align_size-1)))
01734
01735 spe_param_ptr_decl = b.decl(spe_param_ptr_type_spec, parameter_var_name)
01736 align_param_addr = b.assign(s[parameter_var_name] ,
01737 b.cast(b.pointer_to(s.spe_param_t), s.spe_param_buffer + b.sub(align_size , s.param_ea_mod)))
01738
01739 sync_buffer_decl = b.decl(b.array_of(b.uchar, 256), 'sync_buffer')
01740 sync_buffer_ptr_decl = b.decl(b.pointer_to(b.uchar), 'sync_buffer_ptr')
01741 sync_buffer_ea_mod = b.decl(b.pointer_to(b.uchar), 'sync_buffer_ea_mod',
01742 init = b.cast(b.pointer_to(b.uchar),
01743 b.band(b.cast(ea_type, s.sync_buffer), 127)))
01744
01745 sync_buffer_ptr_align = b.assign(s.sync_buffer_ptr,
01746 b.cast(b.pointer_to(b.uchar), s.sync_buffer + b.sub(128, s.sync_buffer_ea_mod)))
01747
01748
01749 sync_buffer_init = b.assign(b.ptr_ref(b.cast(b.pointer_to(b.uint),
01750 s.sync_buffer_ptr)),
01751 0)
01752
01753 spe_param_decl = [
01754 param_struct_decl,
01755 param_typedef_decl,
01756 spe_param_ptr_decl,
01757 spe_param_array_decl,
01758 param_mod_ea,
01759 align_param_addr,
01760 sync_buffer_decl,
01761 sync_buffer_ea_mod,
01762 sync_buffer_ptr_decl,
01763 sync_buffer_ptr_align,
01764 sync_buffer_init
01765 ]
01766
01767 spe_program_typespec = spe_runtime.program_typespec()
01768 spe_obj_typespec = spe_runtime.speobj_typespec()
01769 emit_spe_program_load = spe_runtime.get_program_load_func()
01770 emit_run = spe_runtime.get_run_func()
01771 emit_wait = spe_runtime.get_wait_func()
01772 emit_spe_end = spe_runtime.get_spe_end_func()
01773
01774 program_decl = b.decl(spe_program_typespec, program_obj_name, stor_class=ctrump.STOR_CLASS_EXTERN)
01775 program_obj = s[program_obj_name]
01776
01777 get_spe = get_spe_template%'spe_id'
01778
01779 exit_info = li.exit_info
01780 loop_count_incr = exit_info.incr
01781 bound_expr = b.expr(exit_info.bound)
01782
01783 if loop_count_incr != 1:
01784 bound_expr = bound_expr+(loop_count_incr-1)
01785 all_loop_count_expr = bound_expr/loop_count_incr
01786 all_loop_count_decl = b.decl(b.int_, 'all_loop_count', init=all_loop_count_expr)
01787 all_loop_count = s.all_loop_count
01788 else:
01789 all_loop_count_expr = bound_expr
01790 all_loop_count_decl = None
01791 all_loop_count = bound_expr
01792
01793 align_padding = 32
01794 if align_128:
01795 align_padding = 256
01796
01797 seq_map = {}
01798 sequential_test(memop_tree, seq_map)
01799 sequential_test_0(memory_operation.invariants, -1, seq_map)
01800
01801 memsize = calc_memop_size(memop_tree, emit_env, buf_size)
01802
01803 for i in memory_operation.invariants:
01804 calc_memop_size_1(i, memsize, emit_env)
01805
01806 if double_buffering:
01807 memsize.secondary_offset = buf_size/2
01808
01809 spe_dispatch_unit = memsize.spe_dispatch_unit
01810 block_elem_count = memsize.block_elem_count
01811
01812 if block_elem_count <= 0:
01813 raise Exception("too large data size")
01814
01815 spe_num_decls = []
01816 spe_num_decls.append(b.decl(b.int_, 'spe_num', init=get_spe_num))
01817 spe_num_var = s.spe_num
01818 params_var = s[parameter_var_name]
01819
01820 init_param_stmts = []
01821 speid_decl = b.decl(b.int_, 'spe_id')
01822
01823 pvs = params_var[s.spe_id]
01824
01825 for (orig_t,param_t,var) in invariants_and_incptrs:
01826 name = var.name
01827 expr = b.varref(var)
01828
01829 if orig_t != param_t:
01830 expr = b.cast(param_t, expr)
01831
01832 init_param_stmts.append(b.assign(spe_in_param_member(have_spe_output, pvs, name),
01833 expr))
01834
01835
01836 spe_num_decls.append(b.decl(b.int_, 'all_units', init = gen_ceil_div(all_loop_count,spe_dispatch_unit)))
01837
01838 if not ignore_small_size:
01839 spe_num_decls.append(b.if_(s.all_units < s.spe_num,
01840 b.comp(b.assign(s.spe_num , s.all_units))))
01841
01842 unit_num_decl = b.decl(b.int_, 'per_spe_units', init = s.all_units/s.spe_num)
01843 per_spe_loop_count = mul_if_not1(s.per_spe_units, spe_dispatch_unit)
01844 per_spe_loop_count_decl = b.decl(b.int_, 'per_spe_loop_count', init=per_spe_loop_count)
01845
01846 spe_param_member_list = [(ea_type, 'sync_buffer_ea'),
01847 (b.int_,'start'),
01848 (b.int_,'loop_count')]
01849
01850 init_param_stmts.append(
01851 [b.assign(spe_in_param_member(have_spe_output, pvs, 'start'),
01852 s['spe_id'] * s.per_spe_loop_count),
01853 b.assign(spe_in_param_member(have_spe_output, pvs, 'loop_count'),s.per_spe_loop_count),
01854 b.assign(spe_in_param_member(have_spe_output, pvs, 'is_last_spe'),b.paren(s.spe_id == (s.spe_num-1))),
01855 b.assign(spe_in_param_member(have_spe_output, pvs, 'spe_id'),s.spe_id),
01856 b.assign(spe_in_param_member(have_spe_output, pvs, 'all_loop_count'),all_loop_count),
01857 b.assign(spe_in_param_member(have_spe_output, pvs, 'sync_buffer_ea'),b.cast(ea_type,s.sync_buffer_ptr))])
01858
01859 if have_spe_output:
01860 gather_reduction = [b.newline()]
01861 gather_loop_body = []
01862 for i in li.reductions:
01863 var = i.var
01864
01865 if i.op == ctrump.REDUCTIVE_ADD or i.op == ctrump.REDUCTIVE_FADD:
01866 op = ctrump.EXPR_BIN_ADD
01867 elif i.op == ctrump.REDUCTIVE_SUB or i.op == ctrump.REDUCTIVE_FSUB:
01868 op = ctrump.EXPR_BIN_ADD
01869 elif i.op == ctrump.REDUCTIVE_MUL or i.op == ctrump.REDUCTIVE_FMUL:
01870 op = ctrump.EXPR_BIN_MUL
01871 elif i.op == ctrump.REDUCTIVE_BOR:
01872 op = ctrump.EXPR_BIN_BOR
01873 elif i.op == ctrump.REDUCTIVE_BAND:
01874 op = ctrump.EXPR_BIN_BAND
01875 elif i.op == ctrump.REDUCTIVE_BXOR:
01876 op = ctrump.EXPR_BIN_BXOR
01877 elif i.op == ctrump.REDUCTIVE_LAND:
01878 op = ctrump.EXPR_BIN_LAND
01879 elif i.op == ctrump.REDUCTIVE_LXOR:
01880 op = ctrump.EXPR_BIN_LXOR
01881 else:
01882 raise 'invalid reduction'
01883
01884 gather_loop_body.append(b.assign(b.varref(var),
01885 b.binary(op, b.varref(var),
01886 params_var[s.spe_id].member('out').member(var.name))))
01887
01888 gather_loop = b.for_ (b.assign(s.spe_id , 0),
01889 s.spe_id < s.spe_num,
01890 b.postinc(s.spe_id),
01891 b.comp(gather_loop_body))
01892
01893 gather_reduction.append(gather_loop)
01894 gather_reduction.append(b.newline())
01895 else:
01896 gather_reduction = []
01897
01898 ppe_tree = b.comp(
01899 loop_init_stmt,
01900 spe_param_decl,
01901 b.newline(),
01902 all_loop_count_decl,
01903 spe_num_decls,
01904 b.newline(),
01905 unit_num_decl,
01906 per_spe_loop_count_decl,
01907 speid_decl,
01908 program_decl,
01909 b.newline(),
01910 b.for_ (b.assign(s.spe_id , 0),
01911 s.spe_id < s.spe_num,
01912 b.postinc(s.spe_id),
01913 b.comp(b.decl(spe_obj_typespec, 'cur_spe'),
01914 b.assign(s.cur_spe , get_spe),
01915 emit_spe_program_load(s.cur_spe, program_obj),
01916 b.newline(),
01917 init_param_stmts,
01918 b.newline(),
01919 emit_run(s.cur_spe, b.addr(params_var[s.spe_id]))
01920 )
01921 ),
01922
01923 b.newline(),
01924 emit_wait(get_spe_template%'spe_num-1'),
01925 gather_reduction,
01926 )
01927
01928
01929
01930 s = b.clear_scope()
01931
01932 (buffer_decl_global, buffer_decl_local) = emit_buffer_declaration(b, memsize, 'spe_buffer', buf_size, memop_tree,
01933 memory_operation.invariants,
01934 emit_env)
01935
01936 use_vars = bitmap_var_info(cfg.var_info, li.use)
01937 use_kill_decls = decl_var_list(b, use_vars, emit_env.variables)
01938
01939 modify_vars = bitmap_var_info(cfg.var_info, li.modify)
01940 use_kill_decls.append(decl_var_list(b, modify_vars, emit_env.variables))
01941
01942 param_typedef_decl = b.decl(spe_param_type_spec, 'spe_param_t', stor_class=STOR_CLASS_TYPEDEF)
01943 param_decl = b.decl('spe_param_t', parameter_var_name, stor_class=STOR_CLASS_STATIC, align=128)
01944
01945 func_args = b.func_argdecl([(b.ullong, 'spe'),
01946 (b.ullong, 'argp'),
01947 (b.ullong, 'envp')])
01948 mfc_get = b.func('mfc_get')
01949 mfc_put = b.func('mfc_put')
01950 mfc_getf = b.func('mfc_getf')
01951 mfc_putf = b.func('mfc_putf')
01952 ctrump_put_unaligned_func = b.func('ctrump_put_unaligned')
01953
01954 def ctrump_put_unaligned(ls,ea,size,tag,tid,rid):
01955 if isinstance(size,int) and ((size == 1) or (size == 2) or (size == 4) or (size == 8)):
01956 return mfc_put(ls,ea,size,tag,tid,rid)
01957
01958 return ctrump_put_unaligned_func(ls,ea,size,tag,tid,rid)
01959
01960 mfc_write_tag_mask = b.func('mfc_write_tag_mask')
01961 mfc_read_tag_status_all = b.func('mfc_read_tag_status_all')
01962
01963 spe_param = s[parameter_var_name]
01964 init_param_stmts = []
01965
01966 for (orig_t,param_t,var) in invariants_and_incptrs:
01967 name = var.name
01968 expr = spe_in_param_member(have_spe_output, spe_param, name)
01969 pointer_texpr = ctrump.type_apply_unary_pointer_conversion(orig_t)
01970 if pointer_texpr:
01971 texpr = ea_type
01972 name = 'ea_' + str(name)
01973 else:
01974 texpr = orig_t
01975
01976 init_param_stmts.append(b.decl(texpr, name, init=expr))
01977
01978 decl_reduction = []
01979
01980 calc_loop_count = []
01981 calc_loop_count.append(
01982 [b.decl(b.int_, 'per_spe_loop_count', init = spe_in_param_member(have_spe_output, spe_param, 'loop_count')),
01983 b.decl(b.int_, 'spe_id', init = spe_in_param_member(have_spe_output, spe_param, 'spe_id')),
01984 b.decl(b.int_, 'is_last_spe', init = spe_in_param_member(have_spe_output, spe_param, 'is_last_spe')),
01985 b.decl(ea_type, 'sync_buffer_ea', init = spe_in_param_member(have_spe_output, spe_param, 'sync_buffer_ea')),
01986 b.newline(),
01987 b.if_(s.is_last_spe,
01988 b.comp(b.assign(s.per_spe_loop_count ,
01989 (spe_in_param_member(have_spe_output, spe_param, 'all_loop_count')-
01990 (s.per_spe_loop_count*s.spe_id))))),
01991 b.decl(b.int_, 'start_offset',init = spe_in_param_member(have_spe_output, spe_param, 'start'))
01992 ])
01993
01994 for i in li.reductions:
01995 name = i.var.name
01996
01997 if name in emit_env.variables:
01998 decl_reduction.append(b.assign(s[name], 0))
01999 else:
02000 declred[name] = True
02001 decl_reduction.append(b.decl(i.var.type, i.var.name, init = 0))
02002
02003 unaligned_decl = []
02004
02005 mem_access_emitted = {}
02006
02007 class MfcOps:
02008 def __init__(self):
02009 self.write_tag_mask = mfc_write_tag_mask
02010 self.read_tag_status_all = mfc_read_tag_status_all
02011 self.get = mfc_get
02012 self.put = mfc_put
02013 self.putf = mfc_putf
02014 self.getf = mfc_getf
02015 self.put_unaligned = ctrump_put_unaligned
02016
02017 mfc_ops = MfcOps()
02018
02019 invariant_get_stmts = []
02020 invariant_put_stmts = []
02021 invariant_adjust_buffer_stmts = []
02022
02023 for i in memory_operation.invariants:
02024 emit_memory_access(b, i,
02025 align_size, mfc_ops, emit_env,
02026 s.start_offset, memop_tree.loop_node.nest_level, mem_access_emitted)
02027
02028 invariant_get_stmts.append(i.scalar_dma_get_stmts)
02029 invariant_put_stmts.append(i.scalar_dma_put_stmts)
02030
02031 pointer_type = b.pointer_to(i.load_data_type())
02032 buffer = i.buffer_var.buffer
02033 mod = i.buffer_var.mod_var
02034
02035 if double_buffering:
02036 buffer_addr = byte_offset_ptr_add(b, pointer_type, buffer[0], mod[0])
02037 else:
02038 buffer_addr = byte_offset_ptr_add(b, pointer_type, buffer, mod)
02039
02040 invariant_adjust_buffer_stmts.append(b.assign(i.loop_pointer, buffer_addr))
02041
02042 for i in memory_operation.invariants:
02043 replace_array_ref_expr(b, i, double_buffering, ignore_unaligned_data)
02044
02045 stmts = emit_loop_node(b, memsize, memop_tree, emit_env,
02046 mfc_ops, s.per_spe_loop_count, s.start_offset,
02047 memop_tree.loop_node.nest_level, mem_access_emitted, seq_map)
02048
02049 typedef_invariants = {}
02050
02051 for i in invariants:
02052 var = i.var
02053 texpr = var.type
02054 append_typedefs(texpr, typedef_invariants)
02055
02056 loop_blocks = get_loop_blocks(loop_info_node.loop_node.cfg_info)
02057
02058 iv_replaced = {}
02059
02060 top_loop_nest_level = memop_tree.loop_node.nest_level
02061 block_size = memsize.block_elem_count
02062
02063 for bb in loop_blocks:
02064 for r in bb.load_store.stores:
02065 if (not r.var_ref in iv_replaced):
02066 iv_replaced[r.var_ref] = True
02067
02068 for r in bb.load_store.refs:
02069 for x in emit_env.all_block_iv:
02070
02071 if x.iv_level == top_loop_nest_level:
02072 replace_expr = b.paren(b.varref(x.var)+s['block_%s'%x.var.name]*block_size+s.start_offset)
02073 else:
02074 replace_expr = b.paren(b.varref(x.var)+s['block_%s'%x.var.name]*block_size)
02075
02076 if (not r.ref_expr in iv_replaced and r.ref_expr.var.name == x.var.name):
02077 iv_replaced[r.ref_expr] = True
02078 r.ref_expr.replace(b.build_expr(replace_expr))
02079
02080 for x in emit_env.all_scalar_iv:
02081 if x.iv_level == top_loop_nest_level:
02082 replace_expr = b.paren(b.varref(x.var)+s.start_offset)
02083 else:
02084 replace_expr = b.varref(x.var)
02085
02086 if (not r.ref_expr in iv_replaced and r.ref_expr.var.name == x.var.name):
02087 iv_replaced[r.ref_expr] = True
02088 r.ref_expr.replace(b.build_expr(replace_expr))
02089
02090
02091 typedef_invariants_stmt = []
02092 struct_list = []
02093 for i in typedef_invariants:
02094 c = i.code
02095 if c == ctrump.TYPE_TYPEDEF_NAME:
02096 ctrump.spe.convert_type_ppe_to_spe(b, i.defined_to,
02097 emit_env.structs,
02098 emit_env.struct_pending,
02099 emit_env.struct_list, abi)
02100 struct_list.append(type)
02101
02102 pendings = emit_env.struct_pending
02103 while len(pendings) > 0:
02104 emit_env.struct_pending = []
02105 for i in pendings:
02106 ctrump.spe.convert_type_ppe_to_spe(b, i, emit_env.structs, emit_env.struct_pending, emit_env.struct_list, abi)
02107
02108 pendings = emit_env.struct_pending
02109
02110 emit_env.struct_list.reverse()
02111 for tp in emit_env.struct_list:
02112 typedef_invariants_stmt.append(b.struct_decl(tp))
02113
02114 if have_spe_output:
02115 put_reduction = [b.newline()]
02116
02117 for i in li.reductions:
02118 name = i.var.name
02119 put_reduction.append(b.assign(spe_param.member('out').member(name),
02120 s[name]))
02121
02122 put_reduction.append(mfc_put(b.addr(spe_param), s.argp, b.sizeof(s[parameter_var_name]), 0, 0, 0))
02123
02124 put_reduction.append(b.newline())
02125 else:
02126 put_reduction = []
02127
02128 last_wait = [mfc_write_tag_mask(b.bcmpl(b.literal(0))),
02129 mfc_read_tag_status_all()]
02130
02131 main_body = [
02132 buffer_decl_local,
02133 use_kill_decls,
02134 unaligned_decl,
02135 decl_reduction,
02136 b.newline(),
02137 mfc_get(b.addr(spe_param), s.argp, b.sizeof(s[parameter_var_name]), 0, 0, 0),
02138 mfc_write_tag_mask(1),
02139 mfc_read_tag_status_all(),
02140 b.newline(),
02141 init_param_stmts,
02142 b.newline(),
02143 calc_loop_count,
02144 invariant_get_stmts,
02145 invariant_adjust_buffer_stmts,
02146 b.newline(),
02147 stmts,
02148 invariant_put_stmts,
02149 put_reduction,
02150 last_wait,
02151 emit_spe_end(have_spe_output, s.spe_id, s.is_last_spe, s.sync_buffer_ea)
02152 ]
02153 main = b.func_def(b.int_,'main',func_args,main_body)
02154
02155 includes = [
02156 b.include("spu_mfcio.h"),
02157 b.include("spu_intrinsics.h"),
02158 b.include("ctrump.h"),
02159 spe_runtime.emit_include()
02160 ]
02161
02162 extdefs = [
02163 includes,
02164 b.newline(),
02165 typedef_invariants_stmt,
02166 b.newline(),
02167 param_struct_decl,
02168 param_typedef_decl,
02169 b.newline(),
02170 param_decl,
02171 buffer_decl_global,
02172 b.newline(),
02173 main
02174 ]
02175
02176 spe_prog = b.translation_unit(extdefs, prog.filename)
02177
02178 stmt.replace(b.build_stmt(ppe_tree))
02179
02180 new_prog = b.translation_unit([b.include('ctrump/runtime/ctrump-libspe2.h'),
02181 b.newline(),
02182 prog.decls], prog.filename)
02183
02184 return [new_prog, spe_prog]
02185
02186 def is_enabled(self, analyze_results, stmt, cfg, loop_node, errors, warnings, hints):
02187 is_for_stmt = (stmt.code == ctrump.STMT_FOR)
02188 if not is_for_stmt:
02189 (stmt_loc_path, stmt_loc_line) = ctrump.get_stmt_loc(stmt)
02190 errors.append(OptimizationError([(stmt_loc_path, stmt_loc_line)],
02191 "ループがforループではありません."))
02192
02193 if loop_node.dep_vec_dir == 0:
02194 pass
02195 else:
02196 (stmt_loc_path, stmt_loc_line) = ctrump.get_stmt_loc(stmt)
02197
02198 errors.append(OptimizationError([(stmt_loc_path, stmt_loc_line)],
02199 'ループ間に依存があります'))
02200
02201 if loop_node.dep_vec_dir == loop_node.ANTI_DEP:
02202 hints.append(OptimizationError([(stmt_loc_path, stmt_loc_line)],
02203 '依存は逆依存(anti-dependency)です。出力バッファと入力バッファを分けると並列化できます。'))
02204
02205 return False
02206
02207 return (is_for_stmt and
02208 (loop_node.error_bits == 0))
02209
02210
def init_optimizer(engine):
    """Register the SPE offloading translator with *engine*'s loop optimizer.

    The translator is wired to run on the results of LoopParallelAnalyzer.
    """
    engine.get_loop_optimizer().append_translator(OffloadSpeTranslator(),
                                                  [LoopParallelAnalyzer])