Maybe the vmv.x.s instruction seems to have problems in certain specific situations #241

LeleJun97 · 2023-08-24T14:07:34Z

My native language is not English, and there may be some issues with my expression. I apologize to you.
This issue was discovered when I tested the assembly code generated by automatic vectorization using riscv32-unknown-elf-gcc.

Firstly,
my C code is as follows:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <stdint.h>

/*
 * Matrix-vector product. A is read as an N x N row-major matrix of 16-bit
 * values and B as an N-element vector of 16-bit values. For every row, C[row]
 * is cleared and then accumulates the 32-bit products A[row*N + col] * B[col].
 */
void matrix_mul_vect(uint32_t N, uint32_t *C, uint16_t *A, uint16_t *B) {
	uint32_t row = 0;
	while (row < N) {
		uint32_t col = 0;
		C[row] = 0;
		while (col < N) {
			/* Widen both operands to 32 bits before multiplying. */
			uint32_t a = A[row*N + col];
			uint32_t b = B[col];
			C[row] = C[row] + a * b;
			++col;
		}
		++row;
	}
}

Because I am not proficient at writing assembly, I let the compiler's automatic vectorization generate the "matrix multiplication vector" function in RISC-V V assembly.

Secondly, I used the command
"riscv32-unknown-elf-gcc -S -O3 -march=rv32imacv matrix_mul_vect.c"
to generate the .S file:

# GCC -O3 auto-vectorized code for
#   void matrix_mul_vect(uint32_t N, uint32_t *C, uint16_t *A, uint16_t *B)
# Arguments on entry: a0 = N, a1 = C, a2 = A, a3 = B.
        .file	"matrix_mul_vect.c"
	.option nopic
	.attribute arch, "rv32i2p0_m2p0_a2p0_f2p0_d2p0_c2p0_v1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0"
	.attribute unaligned_access, 0
	.attribute stack_align, 16
	.text
	.align	1
	.globl	matrix_mul_vect
	.type	matrix_mul_vect, @function
matrix_mul_vect:
	beq	a0,zero,.L1	# N == 0: nothing to compute, return
	slli	t4,a0,2		# t4 = N * 4 (byte size of C, 32-bit elements)
	csrr	t1,vlenb	# t1 = vector register length in bytes
	slli	t5,a0,1		# t5 = N * 2 (byte size of one row of A, 16-bit elements)
	add	t4,a1,t4	# t4 = C + N*4: end pointer used by the outer loop test
	srai	t1,t1,1		# t1 = vlenb / 2: bytes the A/B cursors advance per inner iteration
	vsetvli	a5,zero,e32,m1,ta,mu	# vl = VLMAX for 32-bit elements, LMUL = 1
.L4:				# outer loop: one iteration per element of C (row of A)
	mv	a7,a3		# a7 = cursor into B, restarted for every row
	mv	a6,a2		# a6 = cursor into the current row of A
	mv	a5,a0		# a5 = number of elements still to process in this row
	vmv.v.i	v25,0		# v25 = per-lane 32-bit accumulator, cleared
.L3:				# inner loop: one vector strip of the row per iteration
	vsetvli	t3,a5,e32,m1,ta,mu	# t3 = vl = elements handled by this strip
	vle16.v	v24,(a6)	# load 16-bit elements of A
	vle16.v	v27,(a7)	# load 16-bit elements of B
	vzext.vf2	v26,v24	# zero-extend A elements to 32 bits
	vzext.vf2	v24,v27	# zero-extend B elements to 32 bits
	vsetvli	a4,zero,e32,m1,ta,mu	# vl = VLMAX for the multiply
	vmul.vv	v24,v24,v26	# v24 = B * A, lane by lane
	vsetvli	zero,a5,e32,m1,tu,mu	# vl from remaining count; tail-undisturbed keeps accumulator lanes past vl
	sub	a5,a5,t3	# remaining -= elements just processed
	add	a7,a7,t1	# advance B cursor
	add	a6,a6,t1	# advance A cursor
	vadd.vv	v25,v24,v25	# accumulate the products into v25
	bne	a5,zero,.L3	# more elements left in this row
	vsetvli	a5,zero,e32,m1,ta,mu	# vl = VLMAX for the reduction
	addi	a1,a1,4		# advance C pointer to the next element
	vmv.s.x	v24,zero	# v24[0] = 0: initial value for the reduction
	vredsum.vs	v24,v25,v24	# v24[0] = sum of all lanes of v25 + v24[0]
	vmv.x.s	a5,v24		# a5 = v24[0]; the instruction the issue reports as hanging
	sw	a5,-4(a1)	# store the row result (a1 was already advanced by 4)
	add	a2,a2,t5	# move A to the next row
	bne	a1,t4,.L4	# loop until C pointer reaches its end
.L1:
	ret
	.size	matrix_mul_vect, .-matrix_mul_vect
	.ident	"GCC: (g) 12.0.1 20220505 (prerelease)"

Finally, I put the compiled assembly into the test file.

#include <stdint.h>
#include <string.h>
#include "printf.h"
#include "runtime.h"
#include "util.h"

extern void matrix_mul_vect(uint32_t N, uint32_t *C, uint16_t *A, uint16_t *B);

/*
 * Print an N x N matrix of 16-bit values stored row-major in A.
 *
 * Output is a "Matrix <name> [NxN]:" header line followed by one line per
 * row, with the elements of a row separated by commas.
 *
 * A and name are only read, so both are const-qualified; this also makes it
 * valid to pass a string literal as name.
 */
void printmat(const uint16_t *A, uint32_t N, const char *name) {
	uint32_t i,j;
	/* N is unsigned, so print it with %u; the cast matches the specifier exactly. */
	printf("Matrix %s [%ux%u]:\n",name,(unsigned int)N,(unsigned int)N);
	for (i=0; i<N; i++) {
		for (j=0; j<N; j++) {
			if (j!=0) {
				printf(",");
			}
			/* A uint16_t argument is promoted to int, so %d is the right specifier. */
			printf("%d",A[i*N+j]);
		}
		printf("\n");
	}
}

/*
 * Print an N x N matrix of 32-bit unsigned values stored row-major in C.
 *
 * Output is a "Matrix <name> [NxN]:" header line followed by one line per
 * row, with the elements of a row separated by commas.
 *
 * C and name are only read, so both are const-qualified.
 */
void printmatC(const uint32_t *C, uint32_t N, const char *name) {
	uint32_t i,j;
	/* N is unsigned, so print it with %u; the cast matches the specifier exactly. */
	printf("Matrix %s [%ux%u]:\n",name,(unsigned int)N,(unsigned int)N);
	for (i=0; i<N; i++) {
		for (j=0; j<N; j++) {
			if (j!=0) {
				printf(",");
			}
			/*
			 * Elements are unsigned 32-bit: %d would show values >= 2^31 as
			 * negative numbers, so print them with %u.
			 */
			printf("%u",(unsigned int)C[i*N+j]);
		}
		printf("\n");
	}
}

/*
 * Print the N 16-bit values in numb on one line, each followed by a space,
 * then end the line.
 */
void printnumb(uint16_t *numb, uint32_t N) {
  uint32_t idx = 0;
  while (idx < N) {
    /* uint16_t is promoted to int when passed to printf. */
    printf("%d ", numb[idx]);
    ++idx;
  }
  printf("\n");
}

/*
 * Print the N 32-bit unsigned values in numb in decimal on one line, each
 * followed by a space, then end the line.
 *
 * numb is only read, so it is const-qualified.
 */
void printnumbC(const uint32_t *numb, uint32_t N) {
  uint32_t i;
  for (i = 0; i < N; i++) {
    /*
     * Elements are unsigned 32-bit: %d would show values >= 2^31 as negative
     * numbers, so print them with %u (cast to match the specifier exactly).
     */
    printf("%u ", (unsigned int)numb[i]);
  }
  printf("\n");
}

/*
 * Print the N 32-bit values in numb in hexadecimal on one line, each
 * followed by a space, then end the line.
 */
void printnumbXC(uint32_t *numb, uint32_t N) {
  uint32_t idx = 0;
  while (idx < N) {
    printf("%x ", numb[idx]);
    ++idx;
  }
  printf("\n");
}

/*
 * Test driver: builds a 9x9 matrix X and a 9-element vector Y, prints both,
 * calls matrix_mul_vect to compute Z from them, then prints Z in decimal and
 * in hexadecimal.
 */
int main(int argc, char* argv[])
{
	/* Dimension shared by the matrix, both vectors and every call below. */
	enum { DIM = 9 };
	/*
	 * Row-major DIM x DIM matrix kept as a flat array so that it converts to
	 * the uint16_t * taken by printmat and matrix_mul_vect. A uint16_t[9][9]
	 * would decay to uint16_t (*)[9], which is an incompatible pointer type.
	 * The memory layout is the same as the two-dimensional form.
	 */
	uint16_t x[DIM * DIM] = {
		20,23,29,49,147,237,207,161,163,
		37,39,41,43,45,47,49,51,53,
		55,57,59,61,63,65,67,69,71,
		73,75,77,79,81,83,85,87,89,
		91,93,95,97,99,101,103,105,107,
		109,111,113,115,117,119,121,123,125,
		127,129,131,133,135,137,139,141,143,
		145,147,149,151,153,155,157,159,161,
		163,165,167,169,171,173,175,177,179,
	};
	uint16_t y[DIM] = {2, 4, 9, 28, 125, 726, 5047, 40328, 35209};
	/* Result vector; every element is written by matrix_mul_vect. */
	uint32_t z[DIM];

	/* Command-line arguments are not used. */
	(void)argc;
	(void)argv;

	printmat(x,DIM,"X");
	printf("Y\n");
	printnumb(y, DIM);
	matrix_mul_vect(DIM, z, x, y);
	printf("Z\n");
	printnumbC(z,DIM);
	printf("XZ\n");
	printnumbXC(z,DIM);
	return 0;
}

Normally, it should print out a vector, but it doesn't.
I noticed that it got stuck at the vmv.x.s instruction.
I also tried a lot of code using automatic vectorization, and they all encountered the same problem as above.

Why???

Thank you for seeing this. If you could help me solve the problem, that would be even better!

The text was updated successfully, but these errors were encountered:

LeleJun97 · 2023-08-25T09:33:51Z

I am quite anxious about this issue, and I would greatly appreciate it if you could help me take a look!

978716899 · 2023-08-29T02:49:31Z

Yes, like you, I also encountered this problem. I wrote the LeNet-5 program in C language, and when gcc -O3 automatic vectorization was enabled, the vmv.x.s instruction would get stuck.

NORMAL_CONVOLUTION_FORWARD_int(features_int->input,  features_int->layer1, lenet_int->weight0_1, lenet_int->bias0_1, action);
  8000063a:	4f01                	li	t5,0
  8000063c:	00538cb3          	add	s9,t2,t0
  80000640:	000e2b83          	lw	s7,0(t3)
  80000644:	01cc8b33          	add	s6,s9,t3
  80000648:	88a2                	mv	a7,s0
  8000064a:	00fa8eb3          	add	t4,s5,a5
  8000064e:	00db0d33          	add	s10,s6,a3
  80000652:	c502f057          	vsetivli	zero,5,e32,m1,ta,mu
  80000656:	020d6c87          	vle32.v	v25,(s10)
  8000065a:	00de8d33          	add	s10,t4,a3
  8000065e:	020d6c07          	vle32.v	v24,(s10)
  80000662:	05007c57          	vsetvli	s8,zero,e32,m1,ta,mu
  80000666:	979c2cd7          	vmul.vv	v25,v25,v24
  8000066a:	c102f057          	vsetivli	zero,5,e32,m1,tu,mu
  8000066e:	9fa03c57          	vmv1r.v	v24,v26
  80000672:	03903c57          	vadd.vi	v24,v25,0
  80000676:	01077057          	vsetvli	zero,a4,e32,m1,tu,mu
  8000067a:	cf09                	beqz	a4,80000694 <normal_Predict_int+0x13a>
  8000067c:	020b6c87          	vle32.v	v25,(s6)
  80000680:	020eed87          	vle32.v	v27,(t4)
  80000684:	05007c57          	vsetvli	s8,zero,e32,m1,ta,mu
  80000688:	979dacd7          	vmul.vv	v25,v25,v27
  8000068c:	01077057          	vsetvli	zero,a4,e32,m1,tu,mu
  80000690:	039c0c57          	vadd.vv	v24,v25,v24
  80000694:	05007c57          	vsetvli	s8,zero,e32,m1,ta,mu
  80000698:	0ed1                	addi	t4,t4,20
  8000069a:	42006cd7          	vmv.s.x	v25,zero
  8000069e:	080b0b13          	addi	s6,s6,128
  800006a2:	038cacd7          	vredsum.vs	v25,v24,v25
  800006a6:	43902c57          	vmv.x.s	s8,v25
  800006aa:	9be2                	add	s7,s7,s8
  800006ac:	fbbe91e3          	bne	t4,s11,8000064e <normal_Predict_int+0xf4>

mp-17 · 2023-09-06T20:54:51Z

Hello, thanks a lot for reporting!
I also had a look at this friendlier example (#246), but I can run it on both Verilator and QuestaSim from the main branch.

Can you please provide a similar example that is easily reproducible, so that I can fix the bug?

For example, a main.c with the sequence of failing instructions, the branch you are working on, and which simulator you are using.

Thank you,
Matteo

mp-17 · 2024-08-12T12:44:47Z

Let me know if #342 fixes the issue!

LeleJun97 mentioned this issue Aug 29, 2023

vfmv.f.s and vmv.x.s are supported? #145

Closed

LeleJun97 mentioned this issue Sep 4, 2023

Issues related to the use of vmv and vredsum. #246

Closed

mp-17 added the bug Something isn't working label Sep 6, 2023

mp-17 closed this as completed Aug 12, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Maybe the vmv.x.s instruction seems to have problems in certain specific situations #241

Maybe the vmv.x.s instruction seems to have problems in certain specific situations #241

LeleJun97 commented Aug 24, 2023 •

edited

Loading

LeleJun97 commented Aug 25, 2023

978716899 commented Aug 29, 2023 •

edited

Loading

mp-17 commented Sep 6, 2023 •

edited

Loading

mp-17 commented Aug 12, 2024 •

edited

Loading

Maybe the vmv.x.s instruction seems to have problems in certain specific situations #241

Maybe the vmv.x.s instruction seems to have problems in certain specific situations #241

Comments

LeleJun97 commented Aug 24, 2023 • edited Loading

LeleJun97 commented Aug 25, 2023

978716899 commented Aug 29, 2023 • edited Loading

mp-17 commented Sep 6, 2023 • edited Loading

mp-17 commented Aug 12, 2024 • edited Loading

LeleJun97 commented Aug 24, 2023 •

edited

Loading

978716899 commented Aug 29, 2023 •

edited

Loading

mp-17 commented Sep 6, 2023 •

edited

Loading

mp-17 commented Aug 12, 2024 •

edited

Loading