Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issues related to the use of vmv and vredsum. #246

Closed
LeleJun97 opened this issue Sep 4, 2023 · 4 comments
Closed

Issues related to the use of vmv and vredsum. #246

LeleJun97 opened this issue Sep 4, 2023 · 4 comments
Labels
bug Something isn't working

Comments

@LeleJun97
Copy link

Last time I raised this issue, I thought it was a problem with vmv.x.s.

However, after careful investigation, I found that this was not the case. The actual cause is that the Ara processor freezes when certain instructions are executed in a particular order.

I extracted the dot product test program from the author and wrote a very simple test case:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <stdint.h>
#include "hbird_sdk_soc.h"

/*
 * Dot product of two int16_t arrays of `avl` elements using the RISC-V vector
 * extension. Products and sums are computed on 16-bit elements and the result
 * is returned as int16_t.
 *
 * Two implementations are selected at compile time:
 *   - INTRINSICS defined: RVV intrinsics.
 *   - otherwise:          hand-written inline assembly using fixed registers
 *                         (v8/v16 = chunks of a/b, v24 = accumulator,
 *                         v0 = reduction scalar).
 *
 * Both stripmine the input: every loop iteration requests a vector length
 * `vl` for the remaining `avl` elements (16-bit elements, LMUL=8), processes
 * that chunk, and advances the pointers by `vl`.
 *
 * NOTE(review): if avl is 0 the loop body never runs, so the accumulator
 * (acc / v24) is never written before the reduction reads it -- confirm
 * callers always pass avl > 0.
 * NOTE(review): the final reduction runs with the `vl` left by the last loop
 * iteration. If the last chunk is shorter than an earlier one, accumulator
 * elements beyond that `vl` would presumably not be summed -- TODO confirm
 * behaviour for inputs longer than one chunk.
 */
int16_t dotp_v16b(int16_t *a, int16_t *b, uint64_t avl) {
#ifdef INTRINSICS

  // Keep the full length so the first chunk can be told apart from later ones.
  size_t orig_avl = avl;
  size_t vl = vsetvl_e16m8(avl);

  vint16m8_t acc, buf_a, buf_b;
  vint16m1_t red;

  // Working cursors into the two inputs.
  int16_t *a_ = (int16_t *)a;
  int16_t *b_ = (int16_t *)b;

  // Clean the accumulator
  // NOTE(review): `red` is passed as an operand here before it has been
  // assigned -- confirm the intrinsic only uses it as a destination.
  red = vmv_s_x_i16m1(red, 0, vl);
  // Stripmine and accumulate a partial reduced vector
  for (; avl > 0; avl -= vl) {
    vl = vsetvl_e16m8(avl);
    // Load chunk a and b
    buf_a = vle16_v_i16m8(a_, vl);
    buf_b = vle16_v_i16m8(b_, vl);
    // Multiply and accumulate
    // First chunk: start the accumulator from the plain product.
    // Later chunks: multiply-accumulate on top of the existing accumulator.
    if (avl == orig_avl) {
      acc = vmul_vv_i16m8(buf_a, buf_b, vl);
    } else {
      acc = vmacc_vv_i16m8(acc, buf_a, buf_b, vl);
    }
    // Bump pointers
    a_ += vl;
    b_ += vl;
  }

  // Reduce the accumulator vector (seeded with `red`) and return the scalar
  red = vredsum_vs_i16m8_i16m1(red, acc, red, vl);
  return vmv_x_s_i16m1_i16(red);

#else

  // Keep the full length so the first chunk can be told apart from later ones.
  size_t orig_avl = avl;
  size_t vl;
  // Configure 16-bit elements / LMUL=8 up front; `vl` is set again per chunk.
  asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));

  int16_t red;

  // Working cursors into the two inputs.
  int16_t *a_ = (int16_t *)a;
  int16_t *b_ = (int16_t *)b;

  // NOTE(review): the asm statements below name v0/v8/v16/v24 directly and
  // declare no clobbers; this presumably relies on the compiler not using
  // vector registers in this function -- verify.

  // Clean the accumulator
  asm volatile("vmv.s.x v0, zero");
  // Stripmine and accumulate a partial reduced vector
  for (; avl > 0; avl -= vl) {
    asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
    // Load chunk a and b
    asm volatile("vle16.v v8,  (%0)" ::"r"(a_));
    asm volatile("vle16.v v16, (%0)" ::"r"(b_));
    // Multiply and accumulate
    // First chunk: v24 = v8 * v16. Later chunks: v24 += v8 * v16.
    if (avl == orig_avl) {
      asm volatile("vmul.vv v24, v8, v16");
    } else {
      asm volatile("vmacc.vv v24, v8, v16");
    }
    // Bump pointers
    a_ += vl;
    b_ += vl;
  }

  // Reduce and return
  // v0 is both the seed (cleared before the loop) and the destination.
  asm volatile("vredsum.vs v0, v24, v0");
  asm volatile("vmv.x.s %0, v0" : "=r"(red));
  return red;

#endif
}

/*
 * Print the first N elements of numb on one line, each followed by a single
 * space, then end the line with a newline.
 */
void printnumb(uint16_t *numb, uint32_t N) {
  uint16_t *cursor = numb;
  uint32_t remaining = N;

  // Walk the array with a pointer and a countdown instead of an index.
  while (remaining != 0) {
    printf("%d ", *cursor);
    cursor++;
    remaining--;
  }
  printf("\n");
}

/*
 * Test driver: prints the two input vectors, computes their dot product with
 * dotp_v16b, and prints the result followed by "success!".
 *
 * Both inputs are {1, ..., 9}, so the expected dot product is 285.
 */
int main(void) {
  // Single source of truth for the vector length used by every call below.
  enum { VEC_LEN = 9 };

  int16_t x[VEC_LEN] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  int16_t y[VEC_LEN] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  int16_t zv;

  printf("X\n");
  // printnumb takes uint16_t *; cast explicitly rather than relying on an
  // implicit int16_t * -> uint16_t * conversion, which C does not permit.
  printnumb((uint16_t *)x, VEC_LEN);
  printf("Y\n");
  printnumb((uint16_t *)y, VEC_LEN);

  zv = dotp_v16b(x, y, VEC_LEN);
  printf("\n");
  printf("ZV\n");
  printf("%d \n", zv);
  printf("success!\n");
  return 0;
}

This test case computes the dot product of {1, 2, 3, 4, 5, 6, 7, 8, 9} with itself, which should be 285.

X
1 2 3 4 5 6 7 8 9 
Y
1 2 3 4 5 6 7 8 9 

ZV
285 
success!

However, when I move `asm volatile("vmv.s.x v0, zero");` to immediately before `asm volatile("vredsum.vs v0, v24, v0");`, as shown in the following screenshot:
[image]
the result is never printed.

Why???

@LeleJun97
Copy link
Author

I tried the given code in the “spike” simulator, and the program ran successfully with correct results. By using “verdi” to trace the signals, I found that there is an issue with the arbitration count in the “operand_requester” module’s “rr_arb_tree”.
A temporary workaround is to replace the `vmv.s.x` instruction with `vmv.v.i`, which avoids the problem.

@mp-17 mp-17 added the bug Something isn't working label Sep 6, 2023
@mp-17
Copy link
Collaborator

mp-17 commented Sep 6, 2023

Hello @LeleJun97, I am investigating it right now, thanks for raising the issue!

@mp-17
Copy link
Collaborator

mp-17 commented Sep 6, 2023

This is weird: I can simulate this program successfully with both QuestaSim and Verilator from the main branch (I removed one spurious header and replaced stdio.h with our lighter printf.h):

#include <printf.h>
#include <stdlib.h>
#include <stdint.h>

/*
 * Dot product of two int16_t arrays of `avl` elements using the RISC-V vector
 * extension. Products and sums are computed on 16-bit elements and the result
 * is returned as int16_t.
 *
 * Two implementations are selected at compile time:
 *   - INTRINSICS defined: RVV intrinsics; the reduction scalar is cleared
 *                         before the loop.
 *   - otherwise:          hand-written inline assembly using fixed registers
 *                         (v8/v16 = chunks of a/b, v24 = accumulator,
 *                         v0 = reduction scalar); here the clear of v0
 *                         (vmv.s.x) is issued after the loop, immediately
 *                         before the reduction.
 *
 * Both stripmine the input: every loop iteration requests a vector length
 * `vl` for the remaining `avl` elements (16-bit elements, LMUL=8), processes
 * that chunk, and advances the pointers by `vl`.
 *
 * NOTE(review): if avl is 0 the loop body never runs, so the accumulator
 * (acc / v24) is never written before the reduction reads it -- confirm
 * callers always pass avl > 0.
 * NOTE(review): the final reduction runs with the `vl` left by the last loop
 * iteration. If the last chunk is shorter than an earlier one, accumulator
 * elements beyond that `vl` would presumably not be summed -- TODO confirm
 * behaviour for inputs longer than one chunk.
 */
int16_t dotp_v16b(int16_t *a, int16_t *b, uint64_t avl) {
#ifdef INTRINSICS

  // Keep the full length so the first chunk can be told apart from later ones.
  size_t orig_avl = avl;
  size_t vl = vsetvl_e16m8(avl);

  vint16m8_t acc, buf_a, buf_b;
  vint16m1_t red;

  // Working cursors into the two inputs.
  int16_t *a_ = (int16_t *)a;
  int16_t *b_ = (int16_t *)b;

  // Clean the accumulator
  // NOTE(review): `red` is passed as an operand here before it has been
  // assigned -- confirm the intrinsic only uses it as a destination.
  red = vmv_s_x_i16m1(red, 0, vl);
  // Stripmine and accumulate a partial reduced vector
  for (; avl > 0; avl -= vl) {
    vl = vsetvl_e16m8(avl);
    // Load chunk a and b
    buf_a = vle16_v_i16m8(a_, vl);
    buf_b = vle16_v_i16m8(b_, vl);
    // Multiply and accumulate
    // First chunk: start the accumulator from the plain product.
    // Later chunks: multiply-accumulate on top of the existing accumulator.
    if (avl == orig_avl) {
      acc = vmul_vv_i16m8(buf_a, buf_b, vl);
    } else {
      acc = vmacc_vv_i16m8(acc, buf_a, buf_b, vl);
    }
    // Bump pointers
    a_ += vl;
    b_ += vl;
  }

  // Reduce the accumulator vector (seeded with `red`) and return the scalar
  red = vredsum_vs_i16m8_i16m1(red, acc, red, vl);
  return vmv_x_s_i16m1_i16(red);

#else

  // Keep the full length so the first chunk can be told apart from later ones.
  size_t orig_avl = avl;
  size_t vl;
  // Configure 16-bit elements / LMUL=8 up front; `vl` is set again per chunk.
  asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));

  int16_t red;

  // Working cursors into the two inputs.
  int16_t *a_ = (int16_t *)a;
  int16_t *b_ = (int16_t *)b;

  // NOTE(review): the asm statements below name v0/v8/v16/v24 directly and
  // declare no clobbers; this presumably relies on the compiler not using
  // vector registers in this function -- verify.

  // Clean the accumulator
  // Original position of the v0 clear, left disabled on purpose: the
  // instruction now runs after the loop, right before vredsum.vs.
//  asm volatile("vmv.s.x v0, zero");
  // Stripmine and accumulate a partial reduced vector
  for (; avl > 0; avl -= vl) {
    asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
    // Load chunk a and b
    asm volatile("vle16.v v8,  (%0)" ::"r"(a_));
    asm volatile("vle16.v v16, (%0)" ::"r"(b_));
    // Multiply and accumulate
    // First chunk: v24 = v8 * v16. Later chunks: v24 += v8 * v16.
    if (avl == orig_avl) {
      asm volatile("vmul.vv v24, v8, v16");
    } else {
      asm volatile("vmacc.vv v24, v8, v16");
    }
    // Bump pointers
    a_ += vl;
    b_ += vl;
  }

  // Reduce and return
  // Clear the reduction seed in v0 back-to-back with the reduction that
  // reads it; keep these three instructions in exactly this order.
  asm volatile("vmv.s.x v0, zero");
  asm volatile("vredsum.vs v0, v24, v0");
  asm volatile("vmv.x.s %0, v0" : "=r"(red));
  return red;

#endif
}

/*
 * Print the first N elements of numb on one line, each followed by a single
 * space, then end the line with a newline.
 */
void printnumb(uint16_t *numb, uint32_t N) {
  uint16_t *cursor = numb;
  uint32_t remaining = N;

  // Walk the array with a pointer and a countdown instead of an index.
  while (remaining != 0) {
    printf("%d ", *cursor);
    cursor++;
    remaining--;
  }
  printf("\n");
}

/*
 * Test driver: prints the two input vectors, computes their dot product with
 * dotp_v16b, and prints the result followed by "success!".
 *
 * Both inputs are {1, ..., 9}, so the expected dot product is 285.
 */
int main(void) {
  // Single source of truth for the vector length used by every call below.
  enum { VEC_LEN = 9 };

  int16_t x[VEC_LEN] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  int16_t y[VEC_LEN] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  int16_t zv;

  printf("X\n");
  // printnumb takes uint16_t *; cast explicitly rather than relying on an
  // implicit int16_t * -> uint16_t * conversion, which C does not permit.
  printnumb((uint16_t *)x, VEC_LEN);
  printf("Y\n");
  printnumb((uint16_t *)y, VEC_LEN);

  zv = dotp_v16b(x, y, VEC_LEN);
  printf("\n");
  printf("ZV\n");
  printf("%d \n", zv);
  printf("success!\n");
  return 0;
}

@mp-17
Copy link
Collaborator

mp-17 commented Aug 12, 2024

Let me know if #342 fixes the issue!

@mp-17 mp-17 closed this as completed Aug 12, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

2 participants