As an aside: when structured programming constructs were introduced, one of the reasons many programmers gave for resisting the structured programming methodology was efficiency. They felt they could write code that ran more quickly if they didn't pay attention to structured programming concepts. With current complex pipelines and highly optimizing compilers, the compiler can normally do a better job of producing efficient code than a human can, even a human writing in assembly language, and using structured programming techniques makes it easier for the compiler to do this job.
In this example, there are three basic blocks: initialization code, the termination condition (which is located at the beginning of the loop), and the loop body and index increment code.
; Unoptimized DLX code for:
;     for (i = 0; i < arrsize; i++) c[i] = a[i] + b[i];
; i, arrsize, a, b and c are memory symbols; every use recomputes the
; address and reloads the value. Elements are 4 bytes wide (index << 2).
; "beqz r0, label" is used as an unconditional branch.
; NOTE(review): register numbers run up to r32, so these look like virtual
; (pre-allocation) registers rather than architectural ones -- confirm.

; ---- basic block 1: initialization ----
        xor     r1, r1, r1              ; r1 = 0
        lhi     r2, i >> 16             ; r2 = upper half of &i
        sw      i & 0xffff(r2), r1      ; i = 0
        beqz    r0, loop                ; go to the termination test first

; ---- basic block 3: loop body and index increment ----
body:
        ; r14 = a[i]
        lhi     r8, a >> 16
        addi    r9, r8, a & 0xffff      ; r9 = &a[0]
        lhi     r10, i >> 16
        lw      r11, i & 0xffff(r10)    ; r11 = i
        slli    r12, r11, 2             ; r12 = i * 4 (byte offset)
        add     r13, r9, r12            ; r13 = &a[i]
        lw      r14, 0(r13)

        ; r21 = b[i]
        lhi     r15, b >> 16
        addi    r16, r15, b & 0xffff    ; r16 = &b[0]
        lhi     r17, i >> 16
        lw      r18, i & 0xffff(r17)    ; r18 = i
        slli    r19, r18, 2             ; r19 = i * 4
        add     r20, r16, r19           ; r20 = &b[i]
        lw      r21, 0(r20)

        add     r22, r14, r21           ; r22 = a[i] + b[i]

        ; c[i] = r22
        lhi     r23, c >> 16
        addi    r24, r23, c & 0xffff    ; r24 = &c[0]
        lhi     r25, i >> 16
        lw      r26, i & 0xffff(r25)    ; r26 = i
        slli    r27, r26, 2             ; r27 = i * 4
        add     r28, r24, r27           ; r28 = &c[i]
        sw      0(r28), r22

        ; i = i + 1
        lhi     r29, i >> 16
        lw      r30, i & 0xffff(r29)    ; r30 = i
        addi    r31, r30, 1             ; r31 = i + 1
        lhi     r32, i >> 16            ; upper half of &i (was "i << 16")
        sw      i & 0xffff(r32), r31    ; i = r31
        beqz    r0, loop                ; redundant: loop is the next instruction

; ---- basic block 2: termination condition ----
loop:
        lhi     r3, i >> 16
        lw      r4, i & 0xffff(r3)      ; r4 = i
        lhi     r5, arrsize >> 16       ; upper half of &arrsize (was "&& 16")
        lw      r6, arrsize & 0xffff(r5) ; r6 = arrsize
        slt     r7, r4, r6              ; r7 = (i < arrsize)
        bnez    r7, body                ; run the body while i < arrsize
        beqz    r0, endloop             ; redundant: endloop is the next instruction
endloop:
(we can then eliminate two useless unconditional branches)
; Same loop, c[i] = a[i] + b[i] for i in [0, arrsize), after the
; loop-invariant address computations and the load of arrsize have been
; moved into the initialization block, and the two branches whose target
; was the very next instruction have been dropped.
; i is still reloaded from memory at every use inside the loop.
; "beqz r0, label" is used as an unconditional branch.

; ---- initialization (now includes all hoisted, loop-invariant work) ----
        xor     r1, r1, r1              ; r1 = 0
        lhi     r2, i >> 16             ; r2 = upper half of &i
        sw      i & 0xffff(r2), r1      ; i = 0
        lhi     r3, i >> 16             ; base for the load of i in the loop test
        lhi     r5, arrsize >> 16       ; upper half of &arrsize (was "&& 16")
        lw      r6, arrsize & 0xffff(r5) ; r6 = arrsize
        lhi     r8, a >> 16
        addi    r9, r8, a & 0xffff      ; r9 = &a[0]
        lhi     r10, i >> 16            ; base for the load of i (a[i] index)
        lhi     r15, b >> 16
        addi    r16, r15, b & 0xffff    ; r16 = &b[0]
        lhi     r17, i >> 16            ; base for the load of i (b[i] index)
        lhi     r23, c >> 16
        addi    r24, r23, c & 0xffff    ; r24 = &c[0]
        lhi     r25, i >> 16            ; base for the load of i (c[i] index)
        lhi     r29, i >> 16            ; base for the load of i (increment)
        lhi     r32, i >> 16            ; base for the store of i (was "i << 16")
        beqz    r0, loop                ; go to the termination test first

; ---- loop body and index increment ----
body:
        ; r14 = a[i]
        lw      r11, i & 0xffff(r10)    ; r11 = i
        slli    r12, r11, 2             ; r12 = i * 4 (byte offset)
        add     r13, r9, r12            ; r13 = &a[i]
        lw      r14, 0(r13)

        ; r21 = b[i]
        lw      r18, i & 0xffff(r17)    ; r18 = i
        slli    r19, r18, 2             ; r19 = i * 4
        add     r20, r16, r19           ; r20 = &b[i]
        lw      r21, 0(r20)

        add     r22, r14, r21           ; r22 = a[i] + b[i]

        ; c[i] = r22
        lw      r26, i & 0xffff(r25)    ; r26 = i
        slli    r27, r26, 2             ; r27 = i * 4
        add     r28, r24, r27           ; r28 = &c[i]
        sw      0(r28), r22

        ; i = i + 1
        lw      r30, i & 0xffff(r29)    ; r30 = i
        addi    r31, r30, 1             ; r31 = i + 1
        sw      i & 0xffff(r32), r31    ; i = r31

; ---- termination condition ----
loop:
        lw      r4, i & 0xffff(r3)      ; r4 = i
        slt     r7, r4, r6              ; r7 = (i < arrsize)
        bnez    r7, body                ; run the body while i < arrsize
endloop:
; Same loop, c[i] = a[i] + b[i] for i in [0, arrsize), after common
; subexpression elimination: i is kept in r1 for the whole loop, the
; byte offset i*4 is computed once per iteration, and the duplicate
; computations of the upper half of &i are folded into r2.
; "beqz r0, label" is used as an unconditional branch.
;
; Register roles inside the loop:
;   r1  = i              r2  = upper half of &i    r6  = arrsize
;   r9  = &a[0]          r16 = &b[0]               r24 = &c[0]
;   r12 = i * 4

; ---- initialization ----
        xor     r1, r1, r1              ; r1 = i = 0
        lhi     r2, i >> 16             ; r2 = upper half of &i
        sw      i & 0xffff(r2), r1      ; i = 0 in memory as well
        lhi     r5, arrsize >> 16       ; upper half of &arrsize (was "&& 16")
        lw      r6, arrsize & 0xffff(r5) ; r6 = arrsize
        lhi     r8, a >> 16
        addi    r9, r8, a & 0xffff      ; r9 = &a[0]
        lhi     r15, b >> 16
        addi    r16, r15, b & 0xffff    ; r16 = &b[0]
        lhi     r23, c >> 16
        addi    r24, r23, c & 0xffff    ; r24 = &c[0]
        beqz    r0, loop                ; go to the termination test first

; ---- loop body and index increment ----
body:
        slli    r12, r1, 2              ; r12 = i * 4, shared by all three accesses
        add     r13, r9, r12            ; r13 = &a[i]
        lw      r14, 0(r13)             ; r14 = a[i]
        add     r20, r12, r16           ; r20 = &b[i]
        lw      r21, 0(r20)             ; r21 = b[i]
        add     r22, r14, r21           ; r22 = a[i] + b[i]
        add     r28, r12, r24           ; r28 = &c[i]
        sw      0(r28), r22             ; c[i] = r22

        ; i = i + 1. The result must land in r1: the body and the loop
        ; test both read i from r1 (writing it to r31 left r1 stuck at 0).
        addi    r1, r1, 1
        sw      i & 0xffff(r2), r1      ; keep the memory copy of i current

; ---- termination condition ----
loop:
        slt     r7, r1, r6              ; r7 = (i < arrsize); r1 is i, r2 is only an address
        bnez    r7, body                ; run the body while i < arrsize
endloop:
(another bit of factoring would move the store of i out past the bottom of the loop).
loop:
loop body(i)
i = i - 1;
if (!terminate(i)) goto loop
and transform it to
loop:
loop body(i)
loop body(i-1)
i = i - 2;
if (!terminate(i)) goto loop
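This has two advantages:
1. The loop overhead (the index update, the termination test and the branch) is executed only once for every two copies of the loop body.
2. The loop body becomes a larger basic block, which gives the instruction scheduler more instructions to choose from when reordering.
The cost is that if the number of iterations is not a multiple of the unrolling factor, extra code is needed to handle the leftover values of i at the end. This is likely
to be difficult for the compiler to generate.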
In the example, unrolling the loop once (so we combine two iterations into a single iteration), we get:
; The c[i] = a[i] + b[i] loop unrolled once: each trip through the body
; handles elements i and i+1, then advances i by 2. The store of i to
; memory has been moved below the loop.
; "beqz r0, label" is used as an unconditional branch.
;
; NOTE(review): the test is still "i < arrsize" while the body touches
; both i and i+1, so an odd arrsize would process one element past the
; end. This listing appears to assume an even element count -- confirm,
; or add a cleanup iteration.
;
; Register roles inside the loop:
;   r1  = i              r2  = upper half of &i    r6  = arrsize
;   r9  = &a[0]          r16 = &b[0]               r24 = &c[0]
;   r12 = i * 4

; ---- initialization ----
        xor     r1, r1, r1              ; r1 = i = 0
        lhi     r2, i >> 16             ; r2 = upper half of &i
        sw      i & 0xffff(r2), r1      ; i = 0 in memory as well
        lhi     r5, arrsize >> 16       ; upper half of &arrsize (was "&& 16")
        lw      r6, arrsize & 0xffff(r5) ; r6 = arrsize
        lhi     r8, a >> 16
        addi    r9, r8, a & 0xffff      ; r9 = &a[0]
        lhi     r15, b >> 16
        addi    r16, r15, b & 0xffff    ; r16 = &b[0]
        lhi     r23, c >> 16
        addi    r24, r23, c & 0xffff    ; r24 = &c[0]
        beqz    r0, loop                ; go to the termination test first

; ---- loop body: two elements per trip ----
body:
        slli    r12, r1, 2              ; r12 = i * 4, shared by every access below

        ; c[i] = a[i] + b[i]
        add     r13, r9, r12            ; r13 = &a[i]
        lw      r14, 0(r13)             ; r14 = a[i]
        add     r20, r12, r16           ; r20 = &b[i]
        lw      r21, 0(r20)             ; r21 = b[i]
        add     r22, r14, r21           ; r22 = a[i] + b[i]
        add     r28, r12, r24           ; r28 = &c[i]
        sw      0(r28), r22             ; c[i] = r22

        ; c[i+1] = a[i+1] + b[i+1]
        ; The base addresses are recomputed with the same values as above;
        ; the 4-byte displacement selects the next element.
        add     r13, r9, r12            ; r13 = &a[i]
        lw      r14, 4(r13)             ; r14 = a[i+1]
        add     r20, r12, r16           ; r20 = &b[i]
        lw      r21, 4(r20)             ; r21 = b[i+1]
        add     r22, r14, r21           ; r22 = a[i+1] + b[i+1]
        add     r28, r12, r24           ; r28 = &c[i]
        sw      4(r28), r22             ; c[i+1] = r22

        ; i = i + 2. The result must land in r1: the body and the loop
        ; test both read i from r1 (writing it to r31 left r1 stuck at 0).
        addi    r1, r1, 2

; ---- termination condition ----
loop:
        slt     r7, r1, r6              ; r7 = (i < arrsize); r1 is i, r2 is only an address
        bnez    r7, body                ; run the body while i < arrsize
endloop:
        sw      i & 0xffff(r2), r1      ; write the final i back once, after the loop
The basic approach we will take to scheduling the instructions is to detect the dependences between the instructions, so we can shuffle instructions in the instruction stream.