;Section placement and linkage for the Pixel histogramming routine.
        .sect "icode"
        .global _evtMgrCtrl             ;The event manager control structure.
        .global _histoCtrl              ;The histogram control structure.
        .global _histoEvent_asm
;***************************************************************************************
;
; synopsis: Assembly language version of the Pixel histogramming code. The approach
;           loosely follows that taken by the c code, in that the code is organized
;           (after initialization) into 3 main blocks:
;           a) The frame loop, which handles loading the initial pointers for each new
;              data frame (== 256 words),
;           b) the header-checking loop which handles assignment of the base pointers to
;              the histograms and
;           c) the data loop, which actually processes the hit data.
;
; WARNING! Assigning registers like this allows writing assembly which appears similar
; to linear assembly (.sa). Writing in .sa is near impossible since without
; optimization branch instructions can not be placed in parallel with others (and is
; also essentially equivalent to writing in assembly with .set registers like this),
; and when the optimizer is enabled it is unable to effectively understand the
; densely packed inner loop & insists on poor 'optimizations' which considerably
; increase the loop size instead. Using .set and straight assembly code provides the
; best of both alternatives, *but* if the register assignments shown here are altered
; care must be taken to see that the new assignments do not interfere with each other,
; esp. inside the 2 major loops in the code (frame & data loops). The routine is
; highly optimized & to improve speed & reduce code size some registers are re-used
; (esp. the masks & temporary variables) ==> those sections of the code can break
; easily if intervening code is inserted. Assembly is very unforgiving. The file
; asm-registers.txt contains a text table showing the different sections' register
; assignments in parallel.
;
; Also, the C & linear assembly compilers automatically align the stack pointer on
; a 2-word boundary. The hand assembly code *must* ensure that the stack is aligned
; on a 2-word boundary as well, since it is interruptible. This is not checked at
; compile time!
;
; Since this is an assembly file, including standard c header files does not work,
; meaning that #defined constants and c structure definitions are unavailable.
; Constants & structures from the c code are hard-coded into the assembly source
; code. These include:
;           a) the EventData structure & field positions within data1 & data2,
;       and b) the event manager control structure (from eventHandler.h),
;  and also c) some members of the histogram control structure (from histogram.h)
;
; author: Douglas Ferguson
;************************************************************************************
;
;Program Units/instructions summary: S= Shift, L= Logic, D= Data
;    SHL/R: S, MVK: S, MV: SLD, AND: SL, ADD: SLD, SUB: SLD,
;    CMP*: L, EXT*: S, NOT: SL, X/OR: SL, LD*: D, ST*: D
;
; abbreviations:
;    N[##]: indicates that a note is appended at the function end for the instruction
;    HCS: histogram control structure
;    EDS: event data structure.
;    LUT: link # => module # lookup table.
;
; Registers below which share a register with another have a SHARE indicated in their
; comment. These register pairs need special care since they must be used only in
; their own sections of the loops.
;
; histogramming loop registers:
; A15 reserved (frame ptr) for functions with many inputs (ok here),
; B14 & B15 reserved (data page ptr, stack ptr)
; valid conditionals: a1, a2, b0, b1, b2
;
; valid conditionals are a1, a2, b0, b1 & b2
;
; Register alias summary (derived from the .set lines in this file; every name on a
; row is the same physical register, so only one of them is live at any time):
;    a0 : dAa, eLen                  b0 : dX, fdPtr
;    a1 : dA, xA, dAs                b1 : dB, xB, dBs, externOffset
;    a2 : sA, yA, evtCtrl            b2 : sB, yB, csrReg, hCtrl
;    a3 : dataA, nfd, maskAst        b3 : dataB, ndf, maskBst
;    a4 : ccA, wrapAddr, evtPtr      b4 : ccB, wSub, inc
;    a5 : mask, evData1, iWrapStart  b5 : ccm, evData2, xWrapStart
;    a6 : thA, iLen                  b6 : link, invPtr
;    a7 : thB, isf                   b7 : hLPtr, nxf
;    a8 : hPtrA, lmod                b8 : hPtrB, xsf
;    a9 : hDataA0, nif               b9 : hDataB0, lfInc, len
;    a10: hDataA1, iBuff             b10: hDataB1, xBuff
;    a11: hBaseA                     b11: hBase
;    a12: sPtr                       b12: ePtr, evD1B
;    a13: sp0                        b13: fsrp
;    a14: evD2A                      b14: DP
;    a15: SPA                        b15: SP
;
;Registers used in the data loop:
dAa         .set a0     ;side A auxiliary decision/temp
dA          .set a1     ;side A decision
sA          .set a2     ;side A storage decision
dataA       .set a3     ;side A data
ccA         .set a4     ;combined chip & channel for side A data
mask        .set a5     ;mask for link-header checks.
thA         .set a6     ;indicates that side A data contains 2 hits.
thB         .set a7     ;indicates that side B data contains 2 hits.
hPtrA       .set a8     ;side A occupancy histogram pointer
hDataA0     .set a9     ;side A occupancy histogram data (1st hit)
hDataA1     .set a10    ;side A occupancy histogram data (2nd hit)
hBaseA      .set a11    ;occupancy histogram base (A side copy)
sPtr        .set a12    ;pointer to the current word.

dX          .set b0     ;side B auxiliary decision: can be used conditionally.
dB          .set b1     ;side B decision
sB          .set b2     ;side B storage decision
dataB       .set b3     ;side B data
ccB         .set b4     ;combined chip & channel for side B data
ccm         .set b5     ;chip/channel mask: used to signal a data cluster
link        .set b6     ;the link field of a link-header.
hLPtr       .set b7     ;pointer to the histogram/link LUT.
hPtrB       .set b8     ;side B occupancy histogram pointer
hDataB0     .set b9     ;side B occupancy histogram data (1st hit)
hDataB1     .set b10    ;side B occupancy histogram data (2nd hit)
hBase       .set b11    ;occupancy histogram base (B side copy)
ePtr        .set b12    ;pointer indicating the end of the current block.

csrReg      .set b2     ;register to store the CSR in while toggling the GIE.
sp0         .set a13    ;dbg SP0 reg. address
fsrp        .set b13    ;dbg toggle fsrp/xp0 (SP0 bit 2=> rp 3=>xp)

;Data masks:
;(header_mask is defined here, but the routine loads the same value as a literal with
; "MVKL 0x2100, mask".)
header_mask .equ 0x2100

;These registers are computed or loaded from the stack during pauses from the data loop
;(either in one of the mid-frame pauses or between frames). Since data is still possibly
;continuing the *Base registers *must* be preserved when the loop is re-entered:
;(A) xxxx (B) xxx
;The registers used in the storage sub-loop must remain undisturbed until they're
;no longer needed. Decision registers have been renamed to avoid any confusion with the
;histogramming loop decision registers (unused here). Registers restored from the stack
;are indicated with a R. Recycled registers are indicated with an RCYC.
SPA         .set a15    ;A-side copy of SP (the routine sets it with "MV SP, SPA").
;sTemp      .set a0     ;temporary storage
xA          .set a1     ;RCYC: side A decision 1.
yA          .set a2     ;RCYC: side A decision 2.
nfd         .set a3     ;R: # of frames processed
;iWrap      .set a4     ;internal wrap addr.
;iSub       .set a8     ;internal subtraction addr. ** SHARE: iWrap **
wrapAddr    .set a4     ;address at which the frame pointer wraps
wSub        .set b4     ;subtraction to be done if there is a wrap.
;a6
;a7
nif         .set a9     ;R: # of internal data frames
;xStart     .set a11    ;R: starting address of external event frames (if any).

fdPtr       .set b0     ;R: address which tests whether frame is done.
xB          .set b1     ;side B decision 1: inter-frame / new frame decision.
yB          .set b2     ;RCYC: side B decision 2.
ndf         .set b3     ;R: # of frames containing data in the event.
;xWrap      .set b4     ;external wrap addr.
;xSub       .set b11    ;external subtraction addr. ** SHARE: xWrap **
;b5
;b6
inc         .set b4     ;increment to the sPtr
lfInc       .set b9     ;R: the number of data words in the last data frame.

;offsets to the invalid data pointer and link->module LUT inside the histogram control
;structure, other histogramming variables, and the frame buffer pointers and lengths
;in the event control structure.
;(These are used as bracketed indices, e.g. *+hCtrl[linkPtr_o], in the routine.)
hc_guard_o  .equ 0x3
invData_o   .equ 0x1e
linkPtr_o   .equ 0x18   ;points to the lut pointer.
empty_o     .equ 0x5

ev_guard_o  .equ 0x07
buffPtr0_o  .equ 0x08
buffPtr1_o  .equ 0x09
buffLen0_o  .equ 0x0a
buffLen1_o  .equ 0x0b

;storage positions for the stack storage of the frame-loop variables during
;data processing, and storage for a10-a15 & b10-b13 +b3 on the stack. These are
;byte offsets from the current stack pointer. The order shown here is inverted; the
;offsets at the top represent the bottom of the stack. To add more storage, simply
;add the extra offsets above the xWrap_stk_o & chain them together.
;
;The chain below has 32 entries of 4 bytes, so delta_stk evaluates to 128 bytes, a
;multiple of 8. Adding an odd number of entries would make the frame size an odd
;number of words; see the 2-word stack alignment requirement in the file header.
free0_stk_o  .equ (4)
free1_stk_o  .equ (free0_stk_o +4)
free2_stk_o  .equ (free1_stk_o +4)
free3_stk_o  .equ (free2_stk_o +4)
free4_stk_o  .equ (free3_stk_o +4)
xWrap_stk_o  .equ (free4_stk_o +4)
iWrap_stk_o  .equ (xWrap_stk_o +4)
xLen_stk_o   .equ (iWrap_stk_o +4)
iLen_stk_o   .equ (xLen_stk_o +4)
x_stk_o      .equ (iLen_stk_o +4)
x1_stk_o     .equ (x_stk_o +4)
hLSave_stk_o .equ (x1_stk_o +4)
hLPtr_stk_o  .equ (hLSave_stk_o +4)
invPtr_stk_o .equ (hLPtr_stk_o +4)
lfInc_stk_o  .equ (invPtr_stk_o +4)
xStart_stk_o .equ (lfInc_stk_o +4)
fdPtr_stk_o  .equ (xStart_stk_o +4)
nfd_stk_o    .equ (fdPtr_stk_o +4)
ndf_stk_o    .equ (nfd_stk_o +4)
nif_stk_o    .equ (ndf_stk_o +4)
free5_stk_o  .equ (nif_stk_o +4)
b10_stk_o    .equ (free5_stk_o +4)
a10_stk_o    .equ (b10_stk_o +4)
b11_stk_o    .equ (a10_stk_o +4)
a11_stk_o    .equ (b11_stk_o +4)
b12_stk_o    .equ (a11_stk_o +4)
a12_stk_o    .equ (b12_stk_o +4)
b13_stk_o    .equ (a12_stk_o +4)
a13_stk_o    .equ (b13_stk_o +4)
a14_stk_o    .equ (a13_stk_o +4)
b3_stk_o     .equ (a14_stk_o +4)
a15_stk_o    .equ (b3_stk_o +4)
delta_stk    .equ (a15_stk_o)

;registers needed on start/finish only:
eLen        .set a0     ;empty length
;xLen       .set b0     ;external frame buffer length
dAs         .set a1     ;decision A
dBs         .set b1     ;decision B
evtCtrl     .set a2     ;pointer to the event manager control structure
hCtrl       .set b2     ;pointer to the histogram control structure
maskAst     .set a3     ;side A mask
maskBst     .set b3     ;side B mask

;input registers:
evtPtr      .set a4     ;pointer to EDS => event parameters.
evData1     .set a5     ;1st word of EDS: contains frame location data.
evData2     .set b5     ;2nd word of EDS: contains control & error bits + evt. length
iWrapStart  .set a5     ;compute iWrap at start & store ** SHARE: evData1 **
xWrapStart  .set b5     ;compute xWrap at start & store ** SHARE: evData2 **
iLen        .set a6     ;internal frame buffer length
invPtr      .set b6     ;pointer to the invalid data area of the HCS.
isf         .set a7     ;internal starting frame
nxf         .set b7     ;# of external data frames
xsf         .set b8     ;external starting frame
lmod        .set a8     ;the event length % 0x100: decides the # of data frames
len         .set b9     ;the event length
externOffset .set dBs   ;offset to xsf delta
evD1B       .set b12    ;side B copy of evData1
evD2A       .set a14    ;side A copy of evData2
iBuff       .set a10    ;the internal buffer start
xBuff       .set b10    ;the external buffer start

; from earlier sections (note conflicts):
;hLPtr      .set bx     ;link histogram pointer ** SHARE: col **
;sp0        .set bx     ;dbg SP0 reg. address
;fsrp       .set bx     ;dbg toggle fsrp0 (SP0 bit 2)
;sPtr       .set b12    ;pointer to the current word.
;ePtr       .set b13    ;pointer indicating the end of the current block.

;reserved registers:
DP          .set b14    ;Data Pointer: pointer program's bss section (global variables).
SP          .set b15    ;Stack Pointer

;---------------------------------------------------------------------------------------
; _histoEvent_asm
;   Purpose : histogram the hit data of one event. Flow: register save + set-up
;             (start) -> frameLoop -> prologue -> dataLoop -> epilogue -> frameLoop
;             ... -> end.
;   In      : a4 = evtPtr (pointer to the event data structure). The label comment
;             below also lists b4 and a6, but in this routine both are written (b4 as
;             inc/ccB/wSub, a6 as iLen/thA) before they are ever read.
;             NOTE(review): confirm whether callers still pass them.
;   Globals : _evtMgrCtrl and _histoCtrl are addressed with MVKL/MVKH.
;   Saved   : a10-a15, b10-b13 and b3 are stored in a delta_stk-byte stack frame and
;             reloaded at 'end'.
;   CSR     : bit 0 (the GIE bit toggled via csrReg) is cleared at entry, set for a
;             short window in the epilogue, and set again in the delay slots of the
;             return branch. The entry value is not saved, so the routine always
;             returns with the bit set.
;             NOTE(review): confirm callers never enter with interrupts disabled.
;   Debug   : fsrp is read from and written back to *sp0 (0x018c0024) as a toggle.
;---------------------------------------------------------------------------------------
_histoEvent_asm         ; *event struct (a4), *invalid data (b4), *link base[0] (a6)
;***************************************************************************************
;Store all unsaved registers which are used (a10-a15, b10-b13, and return reg b3)
;on the stack. Note that a15 is the frame ptr; it may be reserved as well for complex
;functions or those with large numbers of arguments (not the case here).
;       MVK     0xa15, a15
;       MVK     0xa14, a14
;       MVK     0xa13, a13
; ||    MVK     0xb13, b13
;       MVK     0xa12, a12
; ||    MVK     0xb12, b12
;       MVK     0xa11, a11
; ||    MVK     0xb11, b11
;       MVK     0xa10, a10
; ||    MVK     0xb10, b10
;
;a15 is stored at the incoming *SP, then SP is lowered by delta_stk and copied to SPA
;so that the A side can address the frame too.
;NOTE(review): the a15 store and the SP adjustment are issued before the GIE bit is
;cleared; confirm that an interrupt taken in that window cannot overwrite the word at
;the incoming SP.
          STW     .D2T1 SPA,*SP
||        MVK     delta_stk, SPA
          MVC     CSR, csrReg
||        SUB     SP, SPA, SP
          AND     -2, csrReg, csrReg
||        MV      SP, SPA
          MVC     csrReg, CSR             ;turn off interrupts
          STW     .D1T1 A14, *+SPA(a14_stk_o)
||        STW     .D2T2 B3, *+SP(b3_stk_o)
          STW     .D1T1 A13, *+SPA(a13_stk_o)
||        STW     .D2T2 B13, *+SP(b13_stk_o)
          STW     .D1T1 A12, *+SPA(a12_stk_o)
||        STW     .D2T2 B12, *+SP(b12_stk_o)
          STW     .D1T1 A11, *+SPA(a11_stk_o)
||        STW     .D2T2 B11, *+SP(b11_stk_o)
          STW     .D1T1 A10, *+SPA(a10_stk_o)
||        STW     .D2T2 B10, *+SP(b10_stk_o)
;         B       preamble
;         NOP     5
preambleRet:
;hCtrl shares b2 with csrReg; csrReg is no longer needed once the CSR has been written.
          MVKL    _evtMgrCtrl, evtCtrl
||        MVKL    _histoCtrl, hCtrl
          MVKH    _evtMgrCtrl, evtCtrl
||        MVKH    _histoCtrl, hCtrl

;***************************************************************************************
; Start of the function: Get the event data words from the input pointer & transfer all
; the needed parameters from the event manager control & histogram control structures
; to the stack and/or dedicated registers. Compute the other necessary event variables
; (#frames to be done, length, etc.) and store them on the stack for later recall.
start:
;01--------------------------------------------------------------------------
          LDW     *+evtPtr[0], evData1            ;get event->data1
||        LDW     *+hCtrl[linkPtr_o], hLPtr       ;get pointer to LUT
||        MVKL    0x018c0024, sp0                 ;DSP's serial port #0 register.
;02--------------------------------------------------------------------------
          LDW     *+evtPtr[1], evData2            ;get event->data2
||        MVKH    0x018c0024, sp0
;03--------------------------------------------------------------------------
          LDW     *+evtCtrl[buffPtr1_o], xBuff
||        LDW     *+hCtrl[empty_o], eLen          ;get empty length
||        MVK     0x00ff, maskAst
;04--------------------------------------------------------------------------
          LDW     *+evtCtrl[buffPtr0_o], iBuff
; ||      LDW     *sp0, fsrp
||        MV      maskAst, maskBst
;05--------------------------------------------------------------------------
;         LDW     *+evtCtrl[buffLen1_o], xLen
          STW     maskAst, *+SP(hLSave_stk_o)
||        LDW     *sp0, fsrp
;06--------------------------------------------------------------------------
          MV      evData1, evD1B
||        LDW     *+evtCtrl[buffLen0_o], iLen
;07--------------------------------------------------------------------------
;nxf shares b7 with hLPtr: the LUT pointer is stored to the stack in this packet and
;recalled in packet 17.
          EXTU    evData1, 8, 24, nif             ;# internal frames
||        AND     maskBst, evData1, nxf           ;# external frames
||        EXTU    evData2, 16, 16, len            ;event length
||        MV      evData2, evD2A
||        STW     hLPtr, *+SP(hLPtr_stk_o)        ;store LUT ptr on stack
;08--------------------------------------------------------------------------
          EXTU    evData1, 0, 24, isf             ;int starting frame
||        EXTU    evD1B, 16, 24, xsf              ;ext starting frame
||        CMPEQ   eLen, len, dBs                  ;N[01]
||        STW     nif, *+SP(nif_stk_o)
;(01 -> 06) to 08= lifetime of evData1
;(02 -> 07)= lifetime of evData2
;(06) to 08= lifetime of evD1B
;(07) to 09= lifetime of evD2A
;09--------------------------------------------------------------------------
          SHL     isf, 0xa, isf                   ;multiply by 0x400
||        SHL     xsf, 0xa, xsf                   ;multiply by 0x400
||        AND     evD2A, maskAst, lmod            ;data length mod 0x100
||        ADD     nif, nxf, ndf                   ;# of event frames.
; ||      MV      nif, ndf                        ;# of event frames.
;10--------------------------------------------------------------------------
          CMPLT   0x6, lmod, dAs                  ;dAs will now decide the last frame set-up.
||        MPY     lmod, 4, lmod
|| [dBs]  B       empty                           ;N[01]
||        ZERO    lfInc
||        XOR     fsrp, 4, fsrp
;11--------------------------------------------------------------------------
   [!dAs] SUB     ndf, 1, ndf                     ;if lmod <= 6, data frames= evt. frames -1
|| [!dAs] MVKL    0x03e8, lfInc
|| [dAs]  SUB     lfInc, 0x18, lfInc
;12--------------------------------------------------------------------------
          NOP
||        STW     fsrp, *sp0
||        MVKL    0x00008000, externOffset        ;N[02]
;13--------------------------------------------------------------------------
;when using 2/cycle: insert here a check on the # of data words per event. If odd,
;subtract 1 from lfInc & check that this last word is a trailer.
          ADD     lmod, lfInc, lfInc              ;lmod <= 6: lfInc= 0x400 -4*6 +4*lmod
                                                  ;lmod > 6: lfInc= 4*lmod -4*6
||        STW     ndf, *+SP(ndf_stk_o)
||        ADD     iBuff, iLen, iWrapStart
||        MVKH    0x00008000, externOffset        ;N[02]
;14--------------------------------------------------------------------------
          SUB     xsf, externOffset, xsf          ;N[02]
; ||      ADD     xBuff, xLen, xWrapStart
||        LDW     *+hCtrl[invData_o], invPtr      ;get pointer to invalid data area
;15--------------------------------------------------------------------------
;The ADD that would compute xWrapStart is commented out above, so the value stored to
;xWrap_stk_o here is whatever b5 holds (last written as evData2).
          STW     iWrapStart, *+SPA(iWrap_stk_o)
||        STW     xWrapStart, *+SP(xWrap_stk_o)
;16--------------------------------------------------------------------------
          STW     iLen, *+SPA(iLen_stk_o)
; ||      STW     xLen, *+SP(xLen_stk_o)
||        ADD     iBuff, isf, iBuff
||        ADD     xBuff, xsf, xBuff
;17--------------------------------------------------------------------------
          LDW     *+SP(hLPtr_stk_o), hLPtr        ;recall LUT ptr from stack
||        MVK     -1, ccm                         ;unphysical value for 1st cc mask
;18--------------------------------------------------------------------------
          STW     xBuff, *+SPA(xStart_stk_o)
;19--------------------------------------------------------------------------
          MVK     1, nfd                          ;not really true, see N[02]
||        MVK     0x0100, inc
||        MV      iBuff, sPtr
||        STW     lfInc, *+SP(lfInc_stk_o)
;20--------------------------------------------------------------------------
          MV      invPtr, hPtrA
||        MV      invPtr, hPtrB
||        STW     invPtr, *+SP(invPtr_stk_o)
;21--------------------------------------------------------------------------
          MV      sPtr, fdPtr
||        MV      invPtr, hBaseA
||        MV      invPtr, hBase
||        MVKL    0x2100, mask
;22--------------------------------------------------------------------------
;23--------------------------------------------------------------------------

;***************************************************************************************
;The frame loop has two parts: One set of instructions handle the inter-frame extension
;loop; this creates a periodic departure from the data loop so that any interrupts which
;may be pending can be handled. (Interrupts will pend in the data loop since there is no
;part of the loop without a multi-cycle instruction executing). A second part of the
;loop checks to see if any more frames from the event need processing, and sets up the
;necessary registers (sPtr, ePtr) if so. It is partly merged with the mid-frame pausing
;part. The different parts are marked IF & NF (inter-frame & new frame) to distinguish
;them. If used in both parts, an instruction is marked as I/NF. Though there is no way
;to distinguish in the 1st cycle as the test results are not ready, since all the other
;tests made are used in the new-frame code they are marked that way.
frameLoop:
          CMPLTU  sPtr, fdPtr, xB                 ;frame *not* done (IF) => xB
||        CMPEQ   nfd, ndf, xA                    ;NF: N[3x] next frame is last frame => xA
||        LDW     *+SPA(iWrap_stk_o), wrapAddr
||        SUB     nfd, 1, nfd                     ;N[XX]
||        XOR     fsrp, 0xc, fsrp                 ;toggle fsrp0 & fsxp0

   [xB]   ADD     inc, ePtr, ePtr                 ;IF: increment the ending pointer.
|| [!xA]  MVK     0x0400, lfInc                   ;NF: fdInc for any but last frame.
|| [!xB]  MV      inc, ePtr                       ;NF: Do 1/4 frame before pause.
||        STW     fsrp, *sp0

;wSub shares b4 with inc; inc was consumed in the packet above.
          LDW     *+SP(iLen_stk_o), wSub
|| [xB]   MVK     1, yB                           ;IF: set yB so branch to end not done

   [xB]   B       prologue                        ;IF: process more data from frame.
|| [xB]   LDH     *+sPtr[1], dataA                ;IF: pre-load next dataA
|| [!xB]  CMPLT   nfd, ndf, yB                    ;NF: event done => !yB
|| [!xB]  MV      lfInc, fdPtr                    ;NF: frame done increment

          CMPGTU  ePtr, fdPtr, yB                 ;I/NF: don't process past last frame word
|| [xB]   LDW     *sPtr++, dataB                  ;IF: pre-load next dataB, see NXX.
|| [!yB]  B       end                             ;NF: all done.

   [yB]   MV      fdPtr, ePtr                     ;I/NF: reset ePtr to last word if needed.
|| [!xB]  STW     nfd, *+SPA(nfd_stk_o)           ;NF: store new # frames done.

          MVK     1, yA                           ;I/NF: set yA so that subtraction not done

   [!xB]  CMPLTU  sPtr, wrapAddr, yA              ;NF: subtraction not needed => yA

   [xB]   SET     mask, 8, 15, mask               ;IF: set up mask for 1st loop cycle.
|| [xB]   MPY     mask, 1, mask                   ;IF: mask -> 0x2100 on 2nd cycle
|| [!yA]  SUB     sPtr, wSub, sPtr                ;NF: perform subtraction if needed.
;**********************************************
;inter-frame loop branches to prologue after instruction above.

          ADD     ePtr, sPtr, ePtr                ;set ePtr
||        ADD     fdPtr, sPtr, fdPtr              ;set fdPtr
||        CMPEQ   nfd, 0, xA                      ;is this the 1st event frame?
||        MVK     0x0028, yA                      ;load the 1st frame increment into yA

   [xA]   ADD     sPtr, yA, sPtr                  ;1st frame => skip header

          LDH     *+sPtr[1], dataA                ;pre-load next dataA
          LDW     *sPtr++, dataB                  ;pre-load next dataB, see NXX.
          STW     fdPtr, *+SP(fdPtr_stk_o)
          NOP     2
          SET     mask, 8, 15, mask
||        MPY     mask, 1, mask

;***************************************************************************************
;Prologue to data loop: load the 1st word, ready storage pointers, and set up
;so that the 2nd word loads just as data loop begins. Note that dataB is processed
;after dataA. The router words arrive as follows: [1st hw][2nd hw] so the 1st half-
;word to be processed has a higher address than the 2nd half-word.
;
;The eight packets below match the dataLoop packets with the instructions that store
;the previous word's histogram data left out; the branch in packet 3 is inverted
;([!dB] B epilogue), so the code falls through into dataLoop when dB is set.
prologue:
;******************** 1 ********************
          EXTU    dataA, 17, 21, ccA              ;get chip & channel from dataA
||        AND     dataA, mask, dA                 ;dataA & 0xff00 => dA
||        EXTU    dataB, 17, 21, ccB              ;get chip & channel from dataB (temp)
||        AND     dataB, mask, dX                 ;dataB & 0xff00 => dX
;******************** 2 ********************
          EXTU    dataA, 16, 31, dAa              ;is this a data word (MSB == 1)?
||        SUB     dA, mask, dA                    ;link header? NXX
; ||      MPY     dchanA, ccA, ccA                ;get offset from module's base addr.
||        MV      hBase, hBaseA
||        EXTU    dataB, 9, 25, link              ;extract module from *dataA* NXX
||        CMPGTU  ePtr, sPtr, dB                  ;check to see if loop is finished.
; ||      MPY     dchan, ccB, ccB                 ;get offset from module's base addr.
;******************** 3 ********************
   [!dB]  B       epilogue                        ;branch executes right before dataLoop
||        CMPGT   0, dataA, dA                    ;data word? NXX
|| [dB]   LDH     *+sPtr[1], dataA                ;LDW for next dataA
|| [!dA]  SET     ccm, 0, 31, ccm                 ;load mask with non-physical value NXX
|| [!dA]  LDW     *+hLPtr[link], hBase            ;get module histogram base.
||        CMPEQ   dX, mask, dX                    ;{bits 8-15 == 0x21} => link header.
;******************** 4 ********************
   [dA]   CMPEQ   ccA, ccm, dA
|| [dB]   LDW     *sPtr++, dataB                  ;load entire word, see NXX.
||        MPY     4, ccA, ccA                     ;convert to words.
|| [dA]   MV      ccA, ccm
||        EXTU    dataB, 25, 25, link             ;extract module (if any) from dataB.
;******************** 5 ********************
          MPY     1, dAa, sA                      ;data test -> sA; ready on cycle #7
|| [dX]   LDW     *+hLPtr[link], hBase            ;NXX
|| [dX]   NOT     dX, ccm                         ;load mask with non-physical value NXX
||        EXTU    dataB, 16, 31, dB
||        MPY     4, ccB, ccB                     ;convert to words: ready in cycle #7.
;******************** 6 ********************
   [!dA]  ADD     ccA, hBaseA, hPtrA              ;NXX
|| [dA]   ADD     8, hPtrB, hPtrA
|| [dB]   CMPEQ   ccB, ccm, dX
|| [dB]   MV      ccB, ccm
|| [!dB]  MPY     1, dB, dX                       ;NXX
;******************** 7 ********************
          AND     1, dataA, thA                   ;does dataA contain two hits?
||        AND     1, dataB, thB                   ;does dataB contain two hits?
|| [sA]   LDW     *+hPtrA[0], hDataA0             ;load the dataA, hit 0 histogram data.
||        MPY     1, dB, sB
;******************** 8 ********************
          SET     mask, 8, 15, mask
||        MPY     mask, 1, mask
|| [sA]   LDW     *+hPtrA[1], hDataA1             ;load the dataA, hit 1 histogram data.
; || [sA] LDW     *+hPtrA[dchanA], hDataA1        ;load the dataA, hit 1 histogram data.
|| [!dX]  ADD     ccB, hBase, hPtrB
|| [dX]   ADD     8, hPtrA, hPtrB

;***************************************************************************************
; Interrupts are not allowed during the data loop, as the code is pipelined and
; constantly has multi-cycle operations active. They are given a chance to run by
; periodically branching into an epilogue to the data loop & briefly enabling the
; GIE (Global Interrupt Enable) bit of the Control Status Register.
;
; The data loop is subject to the following constraints:
; a) Data half-word A must be fully processed (for both link headers & data words)
;    before link decoding begins on half-word B. If hw A is a link header, the
;    histogram base address it corresponds to must be loaded into hPtrB; if it is a
;    data half-word and contains the same chip & channel address as hw B, hPtrB must
;    be incremented using hPtrA. Similar considerations apply going from B -> A,
;    when looping to the next data word, but here the constraints are not as tight.
;    Note that for the tight loop (8 cycles) this forces us to use an auxiliary hBase
;    in the opposite register file to avoid cross-path conflicts.
; b) The loop has two sections which are computed in parallel: a computation sub-loop
;    decodes the next word while the histogramming sub-loop stores the previously
;    computed word. The computation sub-loop must finish on its half-word before the
;    histogramming sub-loop begins, and it must not modify the variables needed by
;    the histogramming sub-loop until they are no longer needed. Typically this
;    involves delaying the assertion of a new value in registers by using multi-
;    plication, and temporary storage in other registers.
; c) Since it is a two-part loop, the loop only operates when there are > 1 data
;    words; a prologue & epilogue handle computation for the 1st histogram storage,
;    and storage of the last word.
dataLoop:
;******************** 1 ******************** DM/M
          EXTU    dataA, 17, 21, ccA              ;get chip & channel from dataA
||        AND     dataA, mask, dA                 ;dataA & 0xff00 => dA
||        EXTU    dataB, 17, 21, ccB              ;get chip & channel from dataB (temp)
||        AND     dataB, mask, dX                 ;dataB & 0xff00 => dX
|| [sB]   LDW     *+hPtrB[0], hDataB0             ;load dataB, hit 0 histogram data NXX
;******************** 2 ******************** -/-
          EXTU    dataA, 16, 31, dAa              ;is this a data word (MSB == 1)?
||        SUB     dA, mask, dA                    ;link header? NXX
; ||      MPY     dchanA, ccA, ccA                ;get offset from module's base addr.
||        MV      hBase, hBaseA
||        EXTU    dataB, 9, 25, link              ;extract module from *dataA* NXX
||        CMPGTU  ePtr, sPtr, dB                  ;check to see if loop is finished.
; ||      MPY     dchan, ccB, ccB                 ;get offset from module's base addr.
|| [sB]   LDW     *+hPtrB[1], hDataB1             ;load the dataB, hit 1 histogram data.
; || [sB] LDW     *+hPtrB[dchan], hDataB1         ;load the dataB, hit 1 histogram data.
;******************** 3 ******************** M/M
   [dB]   B       dataLoop                        ;branch will execute after cycle #8
||        CMPGT   0, dataA, dA                    ;data word? NXX
|| [dB]   LDH     *+sPtr[1], dataA                ;LDW for next dataA
|| [!dA]  SET     ccm, 0, 31, ccm                 ;load mask with non-physical value NXX
|| [!dA]  LDW     *+hLPtr[link], hBase            ;get module histogram base.
||        CMPEQ   .2X dX, mask, dX                ;{bits 8-15 == 0x21} => link header.
;******************** 4 ******************** -/(L or D)M
   [dA]   CMPEQ   ccA, ccm, dA
|| [dB]   LDW     *sPtr++, dataB                  ;load entire word, see NXX.
|| [sA]   ADD     1, hDataA0, hDataA0
||        MPY     4, ccA, ccA                     ;convert to words.
|| [dA]   MV      ccA, ccm
||        EXTU    dataB, 25, 25, link             ;extract module (if any) from dataB.
;******************** 5 ******************** -/-
          MPY     1, dAa, sA                      ;data test -> sA; ready on cycle #7
|| [sA]   ADD     1, hDataA1, hDataA1
|| [sA]   STW     hDataA0, *+hPtrA[0]
|| [sA]   AND     sA, thA, sA                     ;sA now only true if 2 hit data word.
|| [dX]   LDW     *+hLPtr[link], hBase            ;NXX
|| [dX]   NOT     dX, ccm                         ;load mask with non-physical value NXX
||        EXTU    dataB, 16, 31, dB
||        MPY     4, ccB, ccB                     ;convert to words: ready in cycle #7.
;******************** 6 ******************** M/-
   [!dA]  ADD     ccA, hBaseA, hPtrA              ;NXX
|| [dA]   ADD     8, hPtrB, hPtrA
|| [sA]   STW     hDataA1, *+hPtrA[1]
; || [sA] STW     hDataA1, *+hPtrA[dchanA]
|| [dB]   CMPEQ   ccB, ccm, dX
|| [dB]   MV      ccB, ccm
|| [!dB]  MPY     1, dB, dX                       ;NXX
|| [sB]   ADD     1, hDataB0, hDataB0
;******************** 7 ******************** M/-
          AND     1, dataA, thA                   ;does dataA contain two hits?
||        AND     1, dataB, thB                   ;does dataB contain two hits?
|| [sA]   LDW     *+hPtrA[0], hDataA0             ;load the dataA, hit 0 histogram data.
|| [sB]   ADD     1, hDataB1, hDataB1
|| [sB]   STW     hDataB0, *+hPtrB[0]
|| [sB]   AND     sB, thB, sB
||        MPY     1, dB, sB
;******************** 8 ******************** L/M
          SET     mask, 8, 15, mask
||        MPY     mask, 1, mask
|| [sA]   LDW     *+hPtrA[1], hDataA1             ;load the dataA, hit 1 histogram data.
; || [sA] LDW     *+hPtrA[dchanA], hDataA1        ;load the dataA, hit 1 histogram data.
|| [!dX]  ADD     ccB, hBase, hPtrB
|| [dX]   ADD     8, hPtrA, hPtrB
|| [sB]   STW     hDataB1, *+hPtrB[1]
; || [sB] STW     hDataB1, *+hPtrB[dchan]

;***************************************************************************************
;Epilogue to data loop: store the last word, ready pointers for next frame (if it
;exists & needs processing). The data-loop instructions are marked with a DL. Most
;instructions in the epilogue are incompressible-- i.e. they are scheduled for a
;particular slot based upon instructions in the data loop above. A few frame loop
;registers are loaded in some of the (many) spare instructions. More additions are
;pointless-- see the frame loop description for details. The exceptions are the nfd
;addition and setting inc, which help the frame loop decide quickly on how to load
;the ending pointer. The epilogue is extended for a few instructions to allow any pending
;interrupts to run; see the frame loop header for details.
epilogue:
;******************** 1 ********************
   [sB]   LDW     *+hPtrB[0], hDataB0
;******************** 2 ********************
   [sB]   LDW     *+hPtrB[1], hDataB1
;  [sB]   LDW     *+hPtrB[dchan], hDataB1
;******************** 3 ********************
          NOP
;******************** 4 ********************
   [sA]   ADD     1, hDataA0, hDataA0
;******************** 5 ********************
   [sA]   ADD     1, hDataA1, hDataA1
|| [sA]   STW     hDataA0, *+hPtrA[0]
|| [sA]   AND     sA, thA, sA
;******************** 6 ********************
   [sA]   STW     hDataA1, *+hPtrA[1]
;  [sA]   STW     hDataA1, *+hPtrA[dchanA]
|| [sB]   ADD     1, hDataB0, hDataB0
;******************** 7 ********************
   [sB]   ADD     1, hDataB1, hDataB1
|| [sB]   STW     hDataB0, *+hPtrB[0]
|| [sB]   AND     sB, thB, sB
;******************** 8 ********************
   [sB]   STW     hDataB1, *+hPtrB[1]
;  [sB]   STW     hDataB1, *+hPtrB[dchan]

;Second part of the epilogue: reload the frame-loop variables from the stack and open
;a window for pending interrupts. csrReg shares b2 with sB, which was last used by
;the [sB] store above.
          MV      SP, SPA
||        LDW     *+SP(nfd_stk_o), nfd
;******************** 2 ********************
          MVC     CSR, csrReg
          OR      1, csrReg, csrReg               ;set CSR bit 0
          MVC     csrReg, CSR
          NOP     8
          MVC     CSR, csrReg
          AND     -2, csrReg, csrReg              ;clear CSR bit 0 again
          MVC     csrReg, CSR
;******************** 2a ********************
          LDW     *+SPA(nif_stk_o), nif
||        LDW     *+SP(ndf_stk_o), ndf
;******************** 3 ********************
          LDW     *+SP(fdPtr_stk_o), fdPtr
||        B       frameLoop                       ;branch will execute after cycle #8
;******************** 4 ********************
;         LDW     *+SPA(xStart_stk_o), xStart
          LDW     *+SP(lfInc_stk_o), lfInc
;******************** 5 ********************
          NOP
;******************** 6 ********************
          NOP
;******************** 7 ********************
          NOP
;******************** 8 ********************
          ADD     2, nfd, nfd                     ;N[02a]
||        MVK     0x0100, inc                     ;N[03]
;******************** 9 ********************

;***************************************************************************************
;end of function:
;Reached from "[dBs] B empty" in the set-up code and from "[!yB] B end" in the frame
;loop. Bit 2 of fsrp is cleared (mask = NOT 4) and written to *sp0, the saved
;registers are reloaded, the frame is released and the routine returns through B3.
empty:
end:
          MV      SP, SPA
||        MVK     4, mask
          NOT     mask, mask
          AND     mask, fsrp, fsrp
          STW     fsrp, *sp0
          LDW     .D2T2 *+SP(b3_stk_o),B3
          LDW     .D2T2 *+SP(b10_stk_o),B10
||        LDW     .D1T1 *+SPA(a10_stk_o),A10
          LDW     .D2T2 *+SP(b11_stk_o),B11
||        LDW     .D1T1 *+SPA(a11_stk_o),A11
          LDW     .D2T2 *+SP(b12_stk_o),B12
||        LDW     .D1T1 *+SPA(a12_stk_o),A12
          MVK     delta_stk, SPA
||        LDW     .D2T2 *+SP(b13_stk_o),B13
||        LDW     .D1T1 *+SPA(a13_stk_o),A13
          ADD     SPA, SP, SP
||        LDW     .D2T1 *+SP(a14_stk_o),A14
          LDW     .D2T1 *SP,SPA
;should allow for interrupt at function end (4 cycles after last LDW):
;The five delay slots of the return branch are filled with the CSR update below.
          B       .S2 B3
          MVC     CSR, csrReg
          OR      1, csrReg, csrReg
          MVC     csrReg, CSR
          NOP     2

; ||      ADD     SPA, SP, SP
; ||      LDW     .D2T1 *+SP(a14_stk_o),A14
;         LDW     .D2T1 *SP,SPA
;         LDW     .D2T1 *++SP(delta_stk),SPA
;         NOP     4

;***************************************************************************************
;Notes:
;
; N[00]: Loaded registers are ready after 5 CPU cycles, including the load instruction.
;        While waiting, load any needed masks & other registers. Only 2 LDW instructions
;        can be issued in a given cycle, and they must load into different register banks
;        (either A or B), using different source banks. See the TI CCS on-line help, in the
;        "instruction set summary"/"shared instructions"/"resource constraints" section for
;        details.
;
; N[xx]: lMod is used to decide whether or not the last data frame is in the last event
;        frame (lMod > 6) or in the frame right before it. (If lMod > 6, then there is
;        at least one data word in the last data frame).
;
; N[xy]  The lfInc variable initially stores the amount that the end-pointer should be
;        decremented by to point to the last data word on the last frame being processed
;        (the extra 6 words account for the trailer). Then this is converted to the
;        maximum increment of the start pointer in the last data frame.
;
;N[01]:  Routine exits if an empty event or only headers & trailers (Pixel).
;
;N[02]:  The # data frames done is incremented here for the comparison in the frame loop
;        which checks if the next frame is the last data frame (i.e. nfd == ndf -1).
;        The 1 is thrown over to nfd. The 2 arises because nfd is incremented here as
;        well if the frame is finished (also see note 02a below).
;
;N[02a]: See note 2 above; if the frame actually is done here, note 2 applies. Once the
;        test has been made, the true value of nfd is restored by the subtraction.
;        If this instruction is reached because the loop extended to allow any pending
;        interrupts to happen, the test & addition will have no effect because the test
;        will be disregarded, and the new value will not be stored by the frame loop.
;        When the routine arrives here once the last event frame has been done it does
;        not matter that nfd is now greater than ndf; the function will end.
;N[03]:  This increment will be added to the ending pointer if the frame is not done.
;N[XX]:  data words have bit #31 (MSB) set, and in the computer's representation of
;        integers (2's complement) this is a negative number.
;N[XX]:  Column pairs in the same stage (for ex. in stage 0 of 32) have hits on:
;        col. 0, row {0, 32, 64, 96, 128} & col. 1, row {159, 127, 95, 63, 31}. If left
;        unchanged this would go into 2 different hashes and thus double cache occupancy.
;        Inverting an odd column's output places both into the same hash.
;N[yz]   Since the mask shares with dA, when the header checking loop is entered the
;        mask is guaranteed to be zero already. The 2nd mask will test the masked off
;        field to check for a header.
;

;address & control regs. were computed
;in the last cycle.

;(needed later after dA re-used) since LDH sign-extends the
;loaded half word, this computation will still work.

;if this data-word is not a repeat,
;load the pointer using the new offset.
;note that if this is a link header,
;the pointer will be re-loaded again
;soon anyway, so it doesn't matter.
;otherwise, we are in a cluster => +2

;This and the following line ensure that
;for cycle #1 mask = 0xff00, but in
;cycle 2 the value jumps back to 0x2100
;for the next link header compare.

;ensure that dB gets translated to dX under all conditions.

;EXTU forces the two source & destination registers to be on the same side. Since there
;are only three cycles to do this in (8 of the previous cycle, and 1 & 2 of the current
;one), dataA would need to be copied over to the B side in one of the two earlier cycles;
;however they cannot be used since the cross paths are already taken by other (critical)
;variables. Hence instead the entire word is actually copied into dataB, and the module
;number is obtained from the otherwise unused high half-word (A). Note this means that
;the alternate method of deciding whether dataB represents hit data (is it less than 0?)
;will not work.

;Checking for a link header is done by 1st ANDing the data with 0xff00 to extract the
;link header field from the router, and then checking to see that it equals 0x2100.
;This can be done by either comparing them, or more generally (but less clearly) by
;subtracting them and using the logical NOT of the resulting value. The compare
;instruction uses either the L or S units, while subtraction uses L, S, or D. The
;comparison for dataA is done using the latter method, since the hBase computed in
;the previous cycle (A or B) if there was a link header must be placed in hBaseA.
;The move instruction can normally use any of the L, S, or D units, but if it is
;copying a B side register to side A, it cannot use the D unit; this would generate
;a conflict if CMPEQ is used as well. If the hBase was generated using the previous
;dataB, this is the earliest (and only) opportunity to transfer.

;The transfer of hBase to side A (hBaseA) must occur before cycle 6 of the next
;cycle; however all the available cross-paths or instructions/side are used in
;cycles 3, 4 and 5. It's impossible to force the extraction of the data B module #
;earlier than cycle 4 without running into unit conflicts or conflicting with the
;side A module extraction, thus cycle 5 is the earliest possible hBase computation,
;and cycle 2 of the next cycle is the only possible cycle in which to copy hBase
;over to side A.
;since (new) thA and thB can be computed only after the storage loop has used
;them (and as the middle A cycles are all full, cycle 7 is the 1st available cycle for
;the thA comp.), the lifetime of dataA & dataB must extend to cycle 7.
;***************************************************************************************
; preamble: zero out all registers except the inputs, then subtract 1 to get a good
; marker for unchanged variables. (This is used during simulation).
;
; Sequence, as coded below:
;   1) a0 and b0 are loaded with the pattern 0xcafecafe, which is then stored into
;      eleven stack slots (side B stores go through SP, side A stores through SPA).
;   2) a0-a3, a5-a14 and b0-b13 are cleared (ZERO, or MPY by 0 for a3/a7/a11 and
;      b3/b7/b11), then each is decremented by 1, leaving every one of them at -1.
;      After this step the 0xcafecafe pattern survives only in the stack slots.
;   3) Control passes to preambleRet. The branch is issued in the second SUB packet;
;      the SUB packets and the NOP that follow it occupy the branch delay slots.
;
; Not modified here: a4 (input: event data pointer), a15, b14 and b15.
;
; NOTE(review): b3 is cleared and decremented along with the other registers, while
;      the routine's exit path returns with "B .S2 B3". Confirm that the return
;      address is saved before this code is entered and restored before that branch.
preamble:
        MVKL    0xcafecafe, b0                  ;lower half of the fill pattern
        MVKH    0xcafecafe, b0                  ;upper half of the fill pattern

;Fill the local-variable stack slots with the pattern, two stores per cycle (one per
;side), except the first cycle where side A is still receiving its copy of the pattern.
        MV      b0, a0
     || STW     .D2T2 B0, *+SP(ndf_stk_o)       ;storage for ndf

        STW     .D1T1 A0,*+SPA(nfd_stk_o)       ;storage for nfd
     || STW     .D2T2 B0, *+SP(fdPtr_stk_o)     ;storage for fdPtr

        STW     .D1T1 A0,*+SPA(invPtr_stk_o)    ;storage for invPtr
     || STW     .D2T2 B0, *+SP(lfInc_stk_o)     ;storage for lfInc

        STW     .D1T1 A0,*+SPA(hLPtr_stk_o)     ;storage for LUT pointer
     || STW     .D2T2 B0, *+SP(x_stk_o)         ;storage for misc

        STW     .D1T1 A0,*+SPA(hLSave_stk_o)    ;storage for saved link
     || STW     .D2T2 B0, *+SP(iLen_stk_o)      ;storage for iLen

        STW     .D1T1 A0, *+SPA(iWrap_stk_o)    ;storage for iWrap
     || STW     .D2T2 B0, *+SP(x1_stk_o)        ;storage for misc1

;Clear the registers, four per side per cycle. The fourth register on each side is
;cleared with a multiply by 0 -- presumably because the three ZEROs already occupy
;the other functional units of that side (TODO confirm against the unit summary).
        ZERO    a0
     || ZERO    a1
     || ZERO    a2
     || MPY     0, a3, a3
     || ZERO    b0
     || ZERO    b1
     || ZERO    b2
     || MPY     0, b3, b3

;       ZERO    a4      ; input (event data pointer)
        ZERO    a5
     || ZERO    a6
     || MPY     0, a7, a7
     || ZERO    b4
     || ZERO    b5
     || ZERO    b6
     || MPY     0, b7, b7

        ZERO    a8
     || ZERO    a9
     || ZERO    a10
     || MPY     0, a11, a11
     || ZERO    b8
     || ZERO    b9
     || ZERO    b10
     || MPY     0, b11, b11

        ZERO    a12
     || ZERO    a13
     || ZERO    a14
;    || MPY     0, a15, a15
     || ZERO    b12
     || ZERO    b13

;Decrement every cleared register: 0 - 1 = -1 is the "unchanged" marker.
        SUB     a0, 1, a0
     || SUB     a1, 1, a1
     || SUB     a2, 1, a2
     || SUB     b0, 1, b0
     || SUB     b1, 1, b1
     || SUB     b2, 1, b2

        SUB     a3, 1, a3
;    || SUB     a4, 1, a4
     || SUB     a5, 1, a5
     || SUB     b3, 1, b3
     || SUB     b4, 1, b4
     || SUB     b5, 1, b5
     || B       preambleRet                     ;return to the caller's continuation label;
                                                ;the packets below fill the delay slots

;NOTE(review): unlike the other groups, this SUB is not joined by "||" to the packet
;that follows, so it issues in a cycle of its own. Confirm whether that is intended.
        SUB     a6, 1, a6

        SUB     a7, 1, a7
     || SUB     a8, 1, a8
     || SUB     b6, 1, b6
     || SUB     b7, 1, b7
     || SUB     b8, 1, b8

        SUB     a9, 1, a9
     || SUB     a10, 1, a10
     || SUB     a11, 1, a11
     || SUB     b9, 1, b9
     || SUB     b10, 1, b10
     || SUB     b11, 1, b11

        SUB     a12, 1, a12
     || SUB     a13, 1, a13
     || SUB     a14, 1, a14
     || SUB     b12, 1, b12
     || SUB     b13, 1, b13
;       SUB     a15, 1, a15

        NOP     2                               ;pad out the remaining branch delay slots