3 * Copyright 2007,2008,2009 Free Software Foundation, Inc.
5 * This file is part of GNU Radio
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 // #define ENABLE_GC_LOGGING // define to enable logging
24 #include <spu_intrinsics.h>
25 #include <spu_mfcio.h>
26 #include <sync_utils.h>
27 #include "gc_spu_config.h"
28 #include "spu_buffers.h"
29 #include <gcell/gc_spu_args.h>
30 #include <gcell/gc_job_desc.h>
31 #include <gcell/gc_mbox.h>
32 #include <gcell/gc_declare_proc.h>
33 #include <gcell/spu/gc_jd_queue.h>
34 #include <gcell/spu/gc_random.h>
35 #include <gcell/spu/gc_delay.h>
// Generic helpers and compile-time configuration.
// NOTE: MIN/MAX are classic function-like macros — each argument may be
// evaluated more than once, so never pass expressions with side effects.
42 #define MIN(a,b) ((a) < (b) ? (a) : (b))
43 #define MAX(a,b) ((a) > (b) ? (a) : (b))
45 //! round x down to p2 boundary (p2 must be a power-of-2)
46 #define ROUND_DN(x, p2) ((x) & ~((p2)-1))
48 //! round x up to p2 boundary (p2 must be a power-of-2)
49 #define ROUND_UP(x, p2) (((x)+((p2)-1)) & ~((p2)-1))
// Which SPU channel carries our outbound messages to the PPE.
// SPU_WrOutMbox is the non-interrupting mailbox; the interrupting
// variant is kept commented out as the alternative.
52 //#define OUT_MBOX_CHANNEL SPU_WrOutIntrMbox
53 #define OUT_MBOX_CHANNEL SPU_WrOutMbox
// Build-time policy switches (0 or 1): poll the job queue only on a
// mailbox message, and/or use the MFC lock-line-reservation-lost event.
55 #define CHECK_QUEUE_ON_MSG 0 // define to 0 or 1
56 #define USE_LLR_LOST_EVENT 0 // define to 0 or 1
// ---- per-SPU module state ---------------------------------------------
58 int gc_sys_tag; // tag for misc DMA operations
// Arguments handed to this SPU by the PPE at startup (DMA'd in by main).
59 static gc_spu_args_t spu_args;
61 static struct gc_proc_def *gc_proc_def; // procedure entry points
63 // ------------------------------------------------------------------------
65 // state for DMA'ing arguments in and out
67 static int get_tag; // 1 tag for job arg gets
68 static int put_tags; // 2 tags for job arg puts
// Job-argument "put" DMAs are double-buffered; pb_idx selects the buffer.
70 static int pb_idx = 0; // current put buffer index (0 or 1)
72 // bitmask (bit per put buffer): bit is set if DMA is started but not complete
73 static int put_in_progress = 0;
74 #define PBI_MASK(_pbi_) (1 << (_pbi_))
76 // ------------------------------------------------------------------------
78 // our working copy of the completion info
// NOTE(review): the initializer list is elided in this view of the file —
// confirm its contents against the full source before editing.
79 static gc_comp_info_t comp_info = {
// Completion info is also double-buffered toward the PPE.
84 static int ci_idx = 0; // index of current comp_info
85 static int ci_tags; // two consecutive dma tags
87 // ------------------------------------------------------------------------
90 * Wait until EA copy of comp_info[idx].in_use is 0
// Spin-poll the effective-address (PPE-side) copy of comp_info[idx] by
// DMA'ing the 128-byte cache line into a local aligned buffer and checking
// its in_use flag.  NOTE(review): the return type, the _tmp declaration,
// and the polling loop are elided in this view — verify against the full
// source.
93 wait_for_ppe_to_be_done_with_comp_info(int idx)
96 char *buf = (char *) ALIGN(_tmp, 128); // get cache-aligned buffer
97 gc_comp_info_t *p = (gc_comp_info_t *) buf;
// The whole structure must be exactly one 128-byte cache line.
99 assert(sizeof(gc_comp_info_t) == 128);
// Fetch the EA copy and block until the DMA completes.
102 mfc_get(buf, spu_args.comp_info[idx], 128, gc_sys_tag, 0, 0);
103 mfc_write_tag_mask(1 << gc_sys_tag);
104 mfc_read_tag_status_all();
// Push the batched job-completion info out to the PPE, wait for it (and
// any outstanding argument-put DMAs) to land, notify the PPE via mailbox,
// then switch to the other completion buffer.  NOTE(review): the return
// type and some interior lines are elided in this view.
114 flush_completion_info(void)
118 static int total_complete = 0;
// Nothing batched — nothing to do.
120 if (comp_info.ncomplete == 0)
123 // ensure that PPE is done with the buffer we're about to overwrite
124 wait_for_ppe_to_be_done_with_comp_info(ci_idx);
126 // dma the comp_info out to PPE
127 int tag = ci_tags + ci_idx;
128 mfc_put(&comp_info, spu_args.comp_info[ci_idx], sizeof(gc_comp_info_t), tag, 0, 0);
130 // we need to wait for the completion info to finish, as well as
131 // any EA argument puts.
133 int tag_mask = 1 << tag; // the comp_info tag
134 if (put_in_progress & PBI_MASK(0))
135 tag_mask |= (1 << (put_tags + 0));
136 if (put_in_progress & PBI_MASK(1))
137 tag_mask |= (1 << (put_tags + 1));
139 gc_log_write2(GCL_SS_SYS, 0x30, put_in_progress, tag_mask);
141 mfc_write_tag_mask(tag_mask); // the tags we're interested in
142 mfc_read_tag_status_all(); // wait for DMA to complete
143 put_in_progress = 0; // mark them all complete
145 total_complete += comp_info.ncomplete;
146 gc_log_write4(GCL_SS_SYS, 0x31,
147 put_in_progress, ci_idx, comp_info.ncomplete, total_complete);
149 // send PPE a message
150 spu_writech(OUT_MBOX_CHANNEL, MK_MBOX_MSG(OP_JOBS_DONE, ci_idx));
// Flip to the other double buffer and mark the one we just sent as
// owned by the PPE until it clears in_use.
152 ci_idx ^= 0x1; // switch buffers
153 comp_info.in_use = 1;
154 comp_info.ncomplete = 0;
157 // ------------------------------------------------------------------------
// ---- capped randomized exponential backoff ----------------------------
// Used when the shared job queue is contended; delays grow exponentially
// up to a cap, with a random perturbation to de-synchronize the SPUs.
160 static unsigned int backoff; // current backoff value in clock cycles
161 static unsigned int _backoff_start;
162 static unsigned int _backoff_cap;
// Table of log2(cycles) vs. cycle counts for reference:
167 * 10 1023 cycles 320 ns
168 * 11 2047 cycle 640 ns
169 * 12 4095 cycles 1.3 us
170 * 13 8191 cycles 2.6 us
171 * 14 16383 cycles 5.1 us
172 * 15 32767 cycles 10.2 us
// Per-nspus starting backoff exponent (indexed by nspus-1, clamped to 16).
// The commented-out row is an earlier, more aggressive tuning.
180 static unsigned char log2_backoff_start[16] = {
181 // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
182 // -------------------------------------------------------------
183 //12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16
184 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11
// Per-nspus maximum (cap) backoff exponent, same indexing convention.
187 static unsigned char log2_backoff_cap[16] = {
188 // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
189 // -------------------------------------------------------------
190 //17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 21, 21
191 13, 14, 14, 14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16
// Derive start/cap cycle counts from the tables for this SPU count.
// NOTE(review): the enclosing function signatures (init/reset/delay)
// are elided in this view of the file.
197 _backoff_cap = (1 << (log2_backoff_cap[(spu_args.nspus - 1) & 0xf])) - 1;
198 _backoff_start = (1 << (log2_backoff_start[(spu_args.nspus - 1) & 0xf])) - 1;
200 backoff = _backoff_start;
203 #if !CHECK_QUEUE_ON_MSG
// Reset to the starting value (used after a successful dequeue).
208 backoff = _backoff_start;
212 #define RANDOM_WEIGHT 0.2
219 // capped exponential backoff
220 backoff = ((backoff << 1) + 1);
221 if (backoff > _backoff_cap)
222 backoff = _backoff_cap;
224 // plus some randomness
// r is uniform in [-RANDOM_WEIGHT, +RANDOM_WEIGHT]; scale backoff by (1+r).
225 float r = (RANDOM_WEIGHT * (2.0 * (gc_uniform_deviate() - 0.5)));
226 backoff = backoff * (1.0 + r);
229 #endif // !CHECK_QUEUE_ON_MSG
231 // ------------------------------------------------------------------------
/*
 * Return an unsigned value with the low @nbits bits set
 * (e.g. make_mask(4) == 0xf).
 *
 * @param nbits  number of low-order bits to set; must be < 32
 *               (callers in this file pass values in [1, 15]).
 *
 * Fix: use the unsigned constant ~0u.  The original wrote ~(~0 << nbits);
 * ~0 is the signed int -1, and left-shifting a negative value is undefined
 * behavior in C (C99 6.5.7p4).  The unsigned form yields identical bits
 * with fully defined semantics.
 */
static inline unsigned int
make_mask(unsigned int nbits)
{
  return ~(~0u << nbits);
}
// ---- divide-and-conquer sub-16-byte DMA state -------------------------
// dc_work is a bitmask of bytes (relative to dc_ls_base/dc_ea_base) that
// still need to be transferred; d_and_c() peels off maximal naturally
// aligned power-of-2 runs and issues one mfc_put per run.
239 static unsigned int dc_work;
240 static int dc_put_tag;
241 static unsigned char *dc_ls_base;
242 static gc_eaddr_t dc_ea_base;
244 // divide and conquer
// NOTE(review): the return type, early returns, and the statement that
// halves `len` before recursing are elided in this view — verify against
// the full source before modifying.
246 d_and_c(unsigned int offset, unsigned int len)
248 unsigned int mask = make_mask(len) << offset;
249 unsigned int t = mask & dc_work;
250 if (t == 0) // nothing to do
252 if (t == mask){ // got a match, generate dma
253 mfc_put(dc_ls_base + offset, dc_ea_base + offset, len, dc_put_tag, 0, 0);
// Otherwise split the range in half and recurse on each half.
257 d_and_c(offset, len);
258 d_and_c(offset + len, len);
262 // Handle the nasty case of a dma xfer that's less than 16 bytes long.
263 // len is guaranteed to be in [1, 15]
// Sets up the dc_* globals so the byte range [ea, ea+len) — expressed as a
// bitmask relative to 8-byte-rounded-down bases — can be transferred by
// the recursive d_and_c() helper.  NOTE(review): the return type and the
// initial d_and_c() invocation are elided in this view.
266 handle_slow_and_tedious_dma(gc_eaddr_t ea, unsigned char *ls,
267 unsigned int len, int put_tag)
269 // Set up for divide and conquer
// LS and EA are assumed congruent mod 8 (same low 3 address bits).
270 unsigned int alignment = ((uintptr_t) ls) & 0x7;
271 dc_work = make_mask(len) << alignment;
272 dc_ls_base = (unsigned char *) ROUND_DN((uintptr_t) ls, 8);
273 dc_ea_base = ROUND_DN(ea, (gc_eaddr_t) 8);
274 dc_put_tag = put_tag;
// Execute one job descriptor: DMA in any EA "get" arguments, invoke the
// registered procedure, DMA out any "put" arguments (double-buffered),
// write the descriptor back to EA, and batch the completion notification.
// NOTE(review): many interior lines (returns, goto wrap_up statements,
// pointer advances, loop closers) are elided in this view of the file —
// do not modify ordering without consulting the full source.
283 process_job(gc_eaddr_t jd_ea, gc_job_desc_t *jd)
287 jd->status = JS_OK; // assume success
// Reject procedure ids beyond the registered table.
289 if (jd->proc_id >= spu_args.nproc_defs)
290 jd->status = JS_UNKNOWN_PROC;
// Fast path: no EA arguments, call the procedure directly.
294 if (jd->eaa.nargs == 0)
295 (*gc_proc_def[jd->proc_id].proc)(&jd->input, &jd->output, &jd->eaa);
297 else { // handle EA args that must be DMA'd in/out
299 gc_job_ea_args_t *eaa = &jd->eaa;
// NELMS: worst-case number of DMA-list elements (declaration elided here).
303 (GC_SPU_BUFSIZE + MFC_MAX_DMA_SIZE - 1) / MFC_MAX_DMA_SIZE);
305 mfc_list_element_t dma_get_list[NELMS];
306 //mfc_list_element_t dma_put_list[NELMS];
308 memset(dma_get_list, 0, sizeof(dma_get_list));
309 //memset(dma_put_list, 0, sizeof(dma_put_list));
311 int gli = 0; // get list index
312 //int pli = 0; // put list index
// Allocation cursors into the LS get/put staging buffers.
314 unsigned char *get_base = _gci_getbuf[0];
315 unsigned char *get_t = get_base;
316 unsigned int total_get_dma_len = 0;
318 unsigned char *put_base = _gci_putbuf[pb_idx];
319 unsigned char *put_t = put_base;
320 unsigned int total_put_alloc = 0;
321 int put_tag = put_tags + pb_idx;
323 // Do we have any "put" args? If so ensure that previous
324 // dma from this buffer is complete
326 gc_log_write2(GCL_SS_SYS, 0x24, put_in_progress, jd->sys.direction_union);
328 if ((jd->sys.direction_union & GCJD_DMA_PUT)
329 && (put_in_progress & PBI_MASK(pb_idx))){
331 gc_log_write2(GCL_SS_SYS, 0x25, put_in_progress, 1 << put_tag);
333 mfc_write_tag_mask(1 << put_tag); // the tag we're interested in
334 mfc_read_tag_status_all(); // wait for DMA to complete
335 put_in_progress &= ~(PBI_MASK(pb_idx));
337 gc_log_write1(GCL_SS_SYS, 0x26, put_in_progress);
341 // for now, all EA's must have the same high 32-bits
342 gc_eaddr_t common_ea = eaa->arg[0].ea_addr;
345 // assign LS addresses for buffers
347 for (unsigned int i = 0; i < eaa->nargs; i++){
349 gc_eaddr_t ea_base = 0;
350 unsigned char *ls_base;
352 unsigned int dma_len;
// GET args: round down to a cache line, DMA the whole covering span,
// and hand the user an LS pointer offset into it.
354 if (eaa->arg[i].direction == GCJD_DMA_GET){
355 ea_base = ROUND_DN(eaa->arg[i].ea_addr, (gc_eaddr_t) CACHE_LINE_SIZE);
356 offset = eaa->arg[i].ea_addr & (CACHE_LINE_SIZE-1);
357 dma_len = ROUND_UP(eaa->arg[i].get_size + offset, CACHE_LINE_SIZE);
358 total_get_dma_len += dma_len;
// Out of staging space — fail the job (jump to wrap_up elided here).
360 if (total_get_dma_len > GC_SPU_BUFSIZE){
361 jd->status = JS_ARGS_TOO_LONG;
367 eaa->arg[i].ls_addr = ls_base + offset;
// Sanity: LS and EA congruent mod 128, everything cache-line aligned.
370 assert((mfc_ea2l(eaa->arg[i].ea_addr) & 0x7f) == ((intptr_t)eaa->arg[i].ls_addr & 0x7f));
371 assert((ea_base & 0x7f) == 0);
372 assert(((intptr_t)ls_base & 0x7f) == 0);
373 assert((dma_len & 0x7f) == 0);
374 assert((eaa->arg[i].get_size <= dma_len)
375 && dma_len <= (eaa->arg[i].get_size + offset + CACHE_LINE_SIZE - 1));
378 // add to dma get list
379 // FIXME (someday) the dma list is where the JS_BAD_EAH limitation comes from
// Split spans larger than MFC_MAX_DMA_SIZE across list elements
// (advancing statements elided in this view).
381 while (dma_len != 0){
382 int n = MIN(dma_len, MFC_MAX_DMA_SIZE);
383 dma_get_list[gli].size = n;
384 dma_get_list[gli].eal = mfc_ea2l(ea_base);
// PUT args: only reserve aligned LS space; the actual copy-out happens
// after the procedure runs, and must not write outside the user buffer.
391 else if (eaa->arg[i].direction == GCJD_DMA_PUT){
393 // This case is a trickier than the PUT case since we can't
394 // write outside of the bounds of the user provided buffer.
395 // We still align the buffers to 128-bytes for good performance
396 // in the middle portion of the xfers.
398 ea_base = ROUND_DN(eaa->arg[i].ea_addr, (gc_eaddr_t) CACHE_LINE_SIZE);
399 offset = eaa->arg[i].ea_addr & (CACHE_LINE_SIZE-1);
401 uint32_t ls_alloc_len =
402 ROUND_UP(eaa->arg[i].put_size + offset, CACHE_LINE_SIZE);
404 total_put_alloc += ls_alloc_len;
406 if (total_put_alloc > GC_SPU_BUFSIZE){
407 jd->status = JS_ARGS_TOO_LONG;
412 put_t += ls_alloc_len;
413 eaa->arg[i].ls_addr = ls_base + offset;
416 assert((mfc_ea2l(eaa->arg[i].ea_addr) & 0x7f)
417 == ((intptr_t)eaa->arg[i].ls_addr & 0x7f));
418 assert((ea_base & 0x7f) == 0);
419 assert(((intptr_t)ls_base & 0x7f) == 0);
427 // fire off the dma to fetch the args and wait for it to complete
428 mfc_getl(get_base, common_ea, dma_get_list, gli*sizeof(dma_get_list[0]), get_tag, 0, 0);
429 mfc_write_tag_mask(1 << get_tag); // the tag we're interested in
430 mfc_read_tag_status_all(); // wait for DMA to complete
// Arguments are in LS — run the user procedure.
433 (*gc_proc_def[jd->proc_id].proc)(&jd->input, &jd->output, &jd->eaa);
436 // Do we have any "put" args? If so copy them out
437 if (jd->sys.direction_union & GCJD_DMA_PUT){
439 // Do the copy out using single DMA xfers. The LS ranges
440 // aren't generally contiguous.
442 bool started_dma = false;
444 for (unsigned int i = 0; i < eaa->nargs; i++){
445 if (eaa->arg[i].direction == GCJD_DMA_PUT && eaa->arg[i].put_size != 0){
453 ea = eaa->arg[i].ea_addr;
454 ls = (unsigned char *) eaa->arg[i].ls_addr;
455 len = eaa->arg[i].put_size;
// Transfers shorter than 16 bytes go through the divide-and-conquer path.
458 handle_slow_and_tedious_dma(ea, ls, len, put_tag);
// Otherwise: peel off 1/2/4/8-byte transfers until ea is 16-byte aligned,
// blast the middle with large DMAs, then peel the tail the same way.
// (Pointer/length advance statements are elided in this view.)
461 if ((ea & 0xf) != 0){
463 // printf("1: ea = 0x%x len = %5d\n", (int) ea, len);
465 // handle the "pre-multiple-of-16" portion
466 // do 1, 2, 4, or 8 byte xfers as required
468 if (ea & 0x1){ // do a 1-byte xfer
469 mfc_put(ls, ea, 1, put_tag, 0, 0);
474 if (ea & 0x2){ // do a 2-byte xfer
475 mfc_put(ls, ea, 2, put_tag, 0, 0);
480 if (ea & 0x4){ // do a 4-byte xfer
481 mfc_put(ls, ea, 4, put_tag, 0, 0);
486 if (ea & 0x8){ // do an 8-byte xfer
487 mfc_put(ls, ea, 8, put_tag, 0, 0);
495 // printf("2: ea = 0x%x len = %5d\n", (int) ea, len);
496 assert((ea & 0xf) == 0);
497 assert((((intptr_t) ls) & 0xf) == 0);
500 // handle the "multiple-of-16" portion
502 int aligned_len = ROUND_DN(len, 16);
503 len = len & (16 - 1);
505 while (aligned_len != 0){
506 int dma_len = MIN(aligned_len, MFC_MAX_DMA_SIZE);
507 mfc_put(ls, ea, dma_len, put_tag, 0, 0);
510 aligned_len -= dma_len;
514 // printf("3: ea = 0x%x len = %5d\n", (int)ea, len);
515 assert((ea & 0xf) == 0);
516 assert((((intptr_t) ls) & 0xf) == 0);
519 // handle "post-multiple-of-16" portion
523 if (len >= 8){ // do an 8-byte xfer
524 mfc_put(ls, ea, 8, put_tag, 0, 0);
529 if (len >= 4){ // do a 4-byte xfer
530 mfc_put(ls, ea, 4, put_tag, 0, 0);
535 if (len >= 2){ // do a 2-byte xfer
536 mfc_put(ls, ea, 2, put_tag, 0, 0);
541 if (len >= 1){ // do a 1-byte xfer
542 mfc_put(ls, ea, 1, put_tag, 0, 0);
// Mark this put buffer busy and flip to the other one; completion is
// awaited lazily in flush_completion_info() or on next reuse.
554 put_in_progress |= PBI_MASK(pb_idx); // note it's running
555 gc_log_write2(GCL_SS_SYS, 0x27, put_in_progress, pb_idx);
556 pb_idx ^= 1; // toggle current buffer
562 wrap_up:; // semicolon creates null statement for C99 compliance
564 // Copy job descriptor back out to EA.
565 // (The dma will be waited on in flush_completion_info)
566 int tag = ci_tags + ci_idx; // use the current completion tag
567 mfc_put(jd, jd_ea, sizeof(*jd), tag, 0, 0);
569 // Tell PPE we're done with the job.
571 // We queue these up until we run out of room, or until we can send
572 // the info to the PPE w/o blocking. The blocking check is in
575 comp_info.job_id[comp_info.ncomplete++] = jd->sys.job_id;
// Completion buffer full — force a flush now.
577 if (comp_info.ncomplete == GC_CI_NJOBS){
578 gc_log_write0(GCL_SS_SYS, 0x28);
579 flush_completion_info();
// ---- main event loop --------------------------------------------------
// Drain the shared job queue, optionally arm/handle the MFC
// lock-line-reservation-lost event, service PPE mailbox commands, and
// opportunistically flush batched completions.  NOTE(review): the
// enclosing function's signature, loop header, and several interior
// lines are elided in this view of the file.
588 static gc_job_desc_t jd; // static gets us proper alignment
592 #if (USE_LLR_LOST_EVENT)
// Arm the lock-line-reservation-lost event so queue activity wakes us.
594 spu_writech(SPU_WrEventMask, MFC_LLR_LOST_EVENT);
597 while (gc_jd_queue_dequeue(spu_args.queue, &jd_ea, ci_tags + ci_idx, &jd))
598 process_job(jd_ea, &jd);
599 // we're now holding a lock-line reservation
604 #if !CHECK_QUEUE_ON_MSG
605 #if (USE_LLR_LOST_EVENT)
// Non-blocking check for a pending SPU event.
607 if (unlikely(spu_readchcnt(SPU_RdEventStat))){
609 // execute standard event handling prologue
611 int status = spu_readch(SPU_RdEventStat);
612 int mask = spu_readch(SPU_RdEventMask);
613 spu_writech(SPU_WrEventMask, mask & ~status); // disable active events
614 spu_writech(SPU_WrEventAck, status); // ack active events
616 // execute per-event actions
618 if (status & MFC_LLR_LOST_EVENT){
620 // We've lost a line reservation. This is most likely caused
621 // by somebody doing something to the queue. Go look and see
622 // if there's anything for us.
624 while (gc_jd_queue_dequeue(spu_args.queue, &jd_ea, ci_tags + ci_idx, &jd) == GCQ_OK)
625 process_job(jd_ea, &jd);
629 // execute standard event handling epilogue
631 spu_writech(SPU_WrEventMask, mask); // restore event mask
636 // try to get a job from the job queue
637 if (gc_jd_queue_dequeue(spu_args.queue, &jd_ea, ci_tags + ci_idx, &jd) == GCQ_OK){
639 gc_log_write2(GCL_SS_SYS, 0x10, jd.sys.job_id, total_jobs);
641 process_job(jd_ea, &jd);
643 gc_log_write2(GCL_SS_SYS, 0x11, jd.sys.job_id, total_jobs);
// Non-blocking check for a PPE mailbox command.
654 if (unlikely(spu_readchcnt(SPU_RdInMbox))){
655 int msg = spu_readch(SPU_RdInMbox);
656 // printf("spu[%d] mbox_msg: 0x%08x\n", spu_args.spu_idx, msg);
657 #if CHECK_QUEUE_ON_MSG
658 if (MBOX_MSG_OP(msg) == OP_CHECK_QUEUE){
661 //int delay = (int)(3200.0 * gc_uniform_deviate()); // uniformly in [0, 1.0us]
664 gc_dequeue_status_t s =
665 gc_jd_queue_dequeue(spu_args.queue, &jd_ea, ci_tags + ci_idx, &jd);
669 gc_log_write2(GCL_SS_SYS, 0x10, jd.sys.job_id, total_jobs);
671 process_job(jd_ea, &jd);
673 gc_log_write2(GCL_SS_SYS, 0x11, jd.sys.job_id, total_jobs);
675 else if (s == GCQ_EMPTY){
678 else { // GCQ_LOCKED -- keep trying
// OP_EXIT: flush outstanding completions before terminating.
684 if (MBOX_MSG_OP(msg) == OP_EXIT){
685 flush_completion_info();
688 else if (MBOX_MSG_OP(msg) == OP_GET_SPU_BUFSIZE){
689 spu_writech(SPU_WrOutIntrMbox, MK_MBOX_MSG(OP_SPU_BUFSIZE, GC_SPU_BUFSIZE_BASE));
693 // If we've got job completion info for the PPE and we can send a
694 // message without blocking, do it.
696 if (comp_info.ncomplete != 0 && spu_readchcnt(OUT_MBOX_CHANNEL) != 0){
697 gc_log_write0(GCL_SS_SYS, 0x12);
698 flush_completion_info();
// SPE entry point: reserve MFC DMA tags, DMA in the spu_args block the
// PPE passed via argp, then initialize the procedure table, RNG seed,
// logging, and backoff parameters.  NOTE(review): the return type, body
// opening, and the tail of the function (presumably the call into the
// main loop and return) run past the end of this view.
705 main(unsigned long long spe_id __attribute__((unused)),
706 unsigned long long argp,
707 unsigned long long envp __attribute__((unused)))
709 gc_sys_tag = mfc_tag_reserve(); // allocate a tag for our misc DMA operations
710 get_tag = mfc_tag_reserve();
// Two consecutive tags each for completion-info and argument-put DMAs.
711 ci_tags = mfc_multi_tag_reserve(2);
712 put_tags = mfc_multi_tag_reserve(2);
// argp is the EA of our gc_spu_args_t; fetch it synchronously.
715 mfc_get(&spu_args, argp, sizeof(spu_args), gc_sys_tag, 0, 0);
716 mfc_write_tag_mask(1 << gc_sys_tag); // the tag we're interested in
717 mfc_read_tag_status_all(); // wait for DMA to complete
719 // initialize pointer to procedure entry table
720 gc_proc_def = (gc_proc_def_t *) spu_args.proc_def_ls_addr;
// Seed the per-SPU RNG with our index so SPUs de-correlate.
722 gc_set_seed(spu_args.spu_idx);
724 // initialize logging
725 _gc_log_init(spu_args.log);
727 backoff_init(); // initialize backoff parameters