I am trying to optimize the following code, but the optimized version is still far from efficient. Software pipelining is limited by the Loop Carried Dependency Bound(^). Length is always divisible by 4.
Original code:
/*
 * Filters `output` in place, one sample at a time, using
 *     y[n] = a * (x[n] + y[n-2]) - x[n-2]
 * where x is the buffer content on entry and y is what gets written back.
 * All history before the first sample (x[-1], x[-2], y[-1], y[-2]) is zero.
 *
 * output : buffer of at least Length floats, read and overwritten.
 * a      : coefficient applied to the sum of the current input and y[n-2].
 * Length : number of samples to process; 0 leaves the buffer untouched.
 */
void Rotate_Input1(float * restrict output, float a, unsigned int Length)
{
    float x_prev1 = 0;   /* x[n-1] */
    float x_prev2 = 0;   /* x[n-2] */
    float y_prev1 = 0;   /* y[n-1] */
    float y_prev2 = 0;   /* y[n-2] */
    int idx = 0;

    while (idx < Length)
    {
        /* Read the input before the slot is overwritten with the result. */
        const float x_now = output[idx];
        const float y_now = a * (x_now + y_prev2) - x_prev2;

        output[idx] = y_now;

        /* Age both two-sample histories by one step. */
        x_prev2 = x_prev1;
        x_prev1 = x_now;
        y_prev2 = y_prev1;
        y_prev1 = y_now;

        idx++;
    }
}
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : ../main.c
;* Loop source line : 10
;* Loop opening brace source line : 11
;* Loop closing brace source line : 23
;* Known Minimum Trip Count : 1
;* Known Max Trip Count Factor : 1
;* Loop Carried Dependency Bound(^) : 7
;* Unpartitioned Resource Bound : 2
;* Partitioned Resource Bound(*) : 2
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 0 0
;* .D units 1 1
;* .M units 1 0
;* .X cross paths 1 1
;* .T address paths 2* 0
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 2 0 (.L or .S unit)
;* Addition ops (.LSD) 2 2 (.L or .S or .D unit)
;* Bound(.L .S .LS) 1 0
;* Bound(.L .S .D .LS .LSD) 2* 1
;*
;* Searching for software pipeline schedule at ...
;* ii = 7 Schedule found with 3 iterations in parallel
;*
;* Register Usage Table:
;* +-----------------------------------------------------------------+
;* |AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB|
;* |00000000001111111111222222222233|00000000001111111111222222222233|
;* |01234567890123456789012345678901|01234567890123456789012345678901|
;* |--------------------------------+--------------------------------|
;* 0: | **** * * | |
;* 1: | ***** ** | |
;* 2: | *** ** ** | |
;* 3: | *** * ** | |
;* 4: | *** * ** | |
;* 5: | **** * ** | |
;* 6: | *** ** ** | |
;* +-----------------------------------------------------------------+
;*
;* Done
;*
;* Loop will be splooped
;* Collapsed epilog stages : 0
;* Collapsed prolog stages : 0
;* Minimum required memory pad : 0 bytes
;*
;* Minimum safe trip count : 1
;* Min. prof. trip count (est.) : 2
;*
;* Mem bank conflicts/iter(est.) : { min 0.000, est 0.000, max 0.000 }
;* Mem bank perf. penalty (est.) : 0.0%
;*
;*
;* Total cycles (est.) : 14 + trip_cnt * 7
The optimized code (not efficient enough):
/*
 * Four-samples-per-pass form of the in-place recursion
 *     y[n] = a * (x[n] + y[n-2]) - x[n-2]
 * with all history before the first sample taken as zero.
 *
 * output : buffer read and overwritten in place; asserted below to be
 *          8-byte aligned.
 * a      : filter coefficient.
 * Length : number of samples; asserted below to be non-zero and a
 *          multiple of 4.
 *
 * NOTE(review): the pairing of _lo()/_hi() with output[n]/output[n + 1]
 * presumably relies on a little-endian target - confirm.
 *
 * NOTE(review): Out1/Out0 are carried from one pass to the next through an
 * add -> multiply -> subtract -> multiply -> add chain. Algebraically, with
 * c[k] = a * x[k] - x[k-2],
 *     y[n+2] = (a * a) * y[n-2] + (a * c[n] + c[n+2])
 * which would leave a single multiply and add on the carried path. That
 * changes floating-point rounding, so confirm it is acceptable before
 * restructuring.
 */
void Rotate_Input2(float * restrict output, float a, unsigned int Length)
{
    float In0, In1, Out0, Out1;
    int n;
    double x1x0, x3x2;
    float in0, in1, in2, in3;
    float sum0, sum1, scaled0, scaled1;
    float lead2, lead3, fwd2, fwd3;
    float res0, res1, fb0, fb1;

    In1 = 0;    /* x[n-2] */
    In0 = 0;    /* x[n-1] */
    Out1 = 0;   /* y[n-2] */
    Out0 = 0;   /* y[n-1] */

    _nassert(Length % 4 == 0);
    _nassert(Length > 0);
    _nassert((int)output % 8 == 0);

    /* The loop advances four samples per pass, so it runs Length / 4 times.
       The assertions above guarantee only one pass, hence a minimum of 1. */
    #pragma MUST_ITERATE(1,,)
    for (n = 0; n < Length; n += 4)
    {
        /* Fetch all four inputs before any of them is overwritten. */
        x1x0 = _amemd8((void*)(output + n));
        x3x2 = _amemd8((void*)(output + n + 2));
        in0 = _itof(_lo(x1x0));
        in1 = _itof(_hi(x1x0));
        in2 = _itof(_lo(x3x2));
        in3 = _itof(_hi(x3x2));

        /* Terms of the second output pair that need no new output value. */
        lead2 = a * in2;
        lead3 = a * in3;
        fwd2 = lead2 - in0;
        fwd3 = lead3 - in1;

        /* First output pair: y[n] and y[n+1]. */
        sum0 = in0 + Out1;
        sum1 = in1 + Out0;
        scaled0 = sum0 * a;
        scaled1 = sum1 * a;
        res0 = scaled0 - In1;
        res1 = scaled1 - In0;

        /* Second output pair: y[n+2] = (a*x[n+2] - x[n]) + a*y[n], and
           likewise for y[n+3]. */
        fb0 = res0 * a;
        fb1 = res1 * a;

        /* Update the history for the next pass. */
        In1 = in2;
        In0 = in3;
        Out1 = fwd2 + fb0;
        Out0 = fwd3 + fb1;

        output[n] = res0;
        output[n + 1] = res1;
        output[n + 2] = Out1;
        output[n + 3] = Out0;
    }
}
;* SOFTWARE PIPELINE INFORMATION
;*
;* Loop found in file : ../main.c
;* Loop source line : 43
;* Loop opening brace source line : 44
;* Loop closing brace source line : 77
;* Known Minimum Trip Count : 4
;* Known Max Trip Count Factor : 1
;* Loop Carried Dependency Bound(^) : 20
;* Unpartitioned Resource Bound : 4
;* Partitioned Resource Bound(*) : 5
;* Resource Partition:
;* A-side B-side
;* .L units 0 0
;* .S units 1 0
;* .D units 4 0
;* .M units 3 3
;* .X cross paths 2 3
;* .T address paths 2 2
;* Long read paths 0 0
;* Long write paths 0 0
;* Logical ops (.LS) 4 4 (.L or .S unit)
;* Addition ops (.LSD) 4 4 (.L or .S or .D unit)
;* Bound(.L .S .LS) 3 2
;* Bound(.L .S .D .LS .LSD) 5* 3
;*
;* Searching for software pipeline schedule at ...
;* ii = 20 Schedule found with 2 iterations in parallel
Can you give me some advice on how to reduce the Loop Carried Dependency Bound(^) and improve the performance? Thanks!