Issues related to the use of vmv and vredsum. #246

LeleJun97 · 2023-09-04T13:26:56Z

Last time I raised this issue, I thought it was a problem with vmv.x.s.

However, after careful investigation, I found that this was not the case. The actual cause is that the Ara processor freezes when certain instructions are executed in a particular order.

I extracted the dot product test program from the author and wrote a very simple test case:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <stdint.h>
#include "hbird_sdk_soc.h"

// Compute the dot product of the int16_t vectors a and b (avl elements each)
// with the RISC-V vector extension and return it as an int16_t.
//
// Two implementations are selected at compile time: vector intrinsics when
// INTRINSICS is defined, hand-written inline assembly otherwise. Both work on
// 16-bit elements with LMUL=8, accumulate element-wise products strip by
// strip, and reduce the accumulator to a scalar once the loop has finished.
//
// NOTE(review): the final reduction runs with the vl of the last strip. If
// the input needs several strips and the last one is shorter than the first,
// accumulator elements beyond that vl look like they are left out of the
// sum -- confirm against the RVV spec before using this with large avl.
// NOTE(review): when avl == 0 the loop body never runs, so the accumulator is
// never written before it is reduced.
int16_t dotp_v16b(int16_t *a, int16_t *b, uint64_t avl) {
#ifdef INTRINSICS

  // Remember the full length so the first strip can be told apart from the
  // later ones inside the loop.
  size_t orig_avl = avl;
  size_t vl = vsetvl_e16m8(avl);

  // acc: element-wise partial products; buf_a/buf_b: current strip of a and b.
  vint16m8_t acc, buf_a, buf_b;
  // red: single-register operand that seeds and receives the reduction.
  vint16m1_t red;

  int16_t *a_ = (int16_t *)a;
  int16_t *b_ = (int16_t *)b;

  // Clean the accumulator
  // NOTE(review): red is passed as the destination operand before it has been
  // given a value.
  red = vmv_s_x_i16m1(red, 0, vl);
  // Stripmine and accumulate a partial reduced vector
  for (; avl > 0; avl -= vl) {
    // Ask for the remaining elements; vl is the count handled in this strip.
    vl = vsetvl_e16m8(avl);
    // Load chunk a and b
    buf_a = vle16_v_i16m8(a_, vl);
    buf_b = vle16_v_i16m8(b_, vl);
    // Multiply and accumulate
    // The first strip initialises acc with a plain multiply; every later
    // strip adds its products on top of the existing contents.
    if (avl == orig_avl) {
      acc = vmul_vv_i16m8(buf_a, buf_b, vl);
    } else {
      acc = vmacc_vv_i16m8(acc, buf_a, buf_b, vl);
    }
    // Bump pointers
    a_ += vl;
    b_ += vl;
  }

  // Reduce and return: sum the elements of acc on top of red, then move
  // element 0 of the result into a scalar.
  red = vredsum_vs_i16m8_i16m1(red, acc, red, vl);
  return vmv_x_s_i16m1_i16(red);

#else

  // Remember the full length so the first strip can be told apart from the
  // later ones inside the loop.
  size_t orig_avl = avl;
  size_t vl;
  // Configure 16-bit elements, LMUL=8, tail- and mask-agnostic; the granted
  // vector length is written to vl.
  asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));

  int16_t red;

  int16_t *a_ = (int16_t *)a;
  int16_t *b_ = (int16_t *)b;

  // NOTE(review): the asm statements below name no vector registers, no
  // vl/vtype dependency and no "memory" clobber; the code relies on the
  // compiler keeping these asm volatile statements in program order.

  // Clean the accumulator
  // (writes x0, i.e. zero, into element 0 of v0, the seed of the reduction)
  asm volatile("vmv.s.x v0, zero");
  // Stripmine and accumulate a partial reduced vector
  for (; avl > 0; avl -= vl) {
    // Ask for the remaining elements; vl is the count handled in this strip.
    asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
    // Load chunk a and b
    // (a goes to the register group starting at v8, b to the one at v16)
    asm volatile("vle16.v v8,  (%0)" ::"r"(a_));
    asm volatile("vle16.v v16, (%0)" ::"r"(b_));
    // Multiply and accumulate
    // The first strip initialises v24 with a plain multiply; every later
    // strip adds its products on top of the existing contents of v24.
    if (avl == orig_avl) {
      asm volatile("vmul.vv v24, v8, v16");
    } else {
      asm volatile("vmacc.vv v24, v8, v16");
    }
    // Bump pointers
    a_ += vl;
    b_ += vl;
  }

  // Reduce and return
  // vredsum.vs adds the active elements of v24 to element 0 of v0 and writes
  // the sum to element 0 of v0; vmv.x.s then copies that element into red.
  asm volatile("vredsum.vs v0, v24, v0");
  asm volatile("vmv.x.s %0, v0" : "=r"(red));
  return red;

#endif
}

// Print the first N elements of numb in decimal, each followed by a space,
// and finish the line with a newline.
void printnumb(uint16_t *numb, uint32_t N) {
  for (uint32_t remaining = N; remaining > 0; remaining--) {
    printf("%d ", *numb);
    numb++;
  }
  printf("\n");
}

// Test driver: prints both input vectors, computes their dot product with
// dotp_v16b() and prints the result followed by "success!".
int main(void) {
  int16_t x[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  int16_t y[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  // Element count taken from the array itself instead of a repeated literal.
  const uint32_t len = (uint32_t)(sizeof(x) / sizeof(x[0]));

  // printnumb() takes uint16_t *. Passing int16_t * without a cast is an
  // incompatible-pointer conversion that recent compilers reject. All values
  // here are non-negative, so the printed output is unchanged.
  printf("X\n");
  printnumb((uint16_t *)x, len);
  printf("Y\n");
  printnumb((uint16_t *)y, len);

  int16_t zv = dotp_v16b(x, y, len);
  printf("\n");
  printf("ZV\n");
  printf("%d \n", zv);
  printf("success!\n");
  return 0;
}

This test case computes the dot product of {1, 2, 3, 4, 5, 6, 7, 8, 9} with itself, which should give 285.

X
1 2 3 4 5 6 7 8 9 
Y
1 2 3 4 5 6 7 8 9 

ZV
285 
success!

But when I move `asm volatile("vmv.s.x v0, zero");` from its position before the loop to immediately before `asm volatile("vredsum.vs v0, v24, v0");`, as shown in the following diagram:

When I do that, the program freezes and no result is printed.

Why???

The text was updated successfully, but these errors were encountered:

LeleJun97 · 2023-09-06T07:51:46Z

I tried the given code in the “spike” simulator, and the program ran successfully with correct results. By using “verdi” to trace the signals, I found that there is an issue with the arbitration count in the “operand_requester” module’s “rr_arb_tree”.
A temporary solution now is to replace the instruction “vmv.s.x” with the instruction “vmv.v.i” to avoid the problem.

mp-17 · 2023-09-06T20:40:19Z

Hello @LeleJun97, I am investigating it right now, thanks for raising the issue!

mp-17 · 2023-09-06T20:50:56Z

This is weird, I can simulate this program successfully with both QuestaSim and Verilator from the main branch (I removed one spurious header and replaced stdio.h with our lighter printf.h):

#include <printf.h>
#include <stdlib.h>
#include <stdint.h>

// Compute the dot product of the int16_t vectors a and b (avl elements each)
// with the RISC-V vector extension and return it as an int16_t.
//
// Two implementations are selected at compile time: vector intrinsics when
// INTRINSICS is defined, hand-written inline assembly otherwise. Both work on
// 16-bit elements with LMUL=8, accumulate element-wise products strip by
// strip, and reduce the accumulator to a scalar once the loop has finished.
//
// In the inline-assembly path of this variant the scalar seed in v0 is
// cleared after the loop, directly before the reduction, instead of before
// the loop.
//
// NOTE(review): the final reduction runs with the vl of the last strip. If
// the input needs several strips and the last one is shorter than the first,
// accumulator elements beyond that vl look like they are left out of the
// sum -- confirm against the RVV spec before using this with large avl.
// NOTE(review): when avl == 0 the loop body never runs, so the accumulator is
// never written before it is reduced.
int16_t dotp_v16b(int16_t *a, int16_t *b, uint64_t avl) {
#ifdef INTRINSICS

  // Remember the full length so the first strip can be told apart from the
  // later ones inside the loop.
  size_t orig_avl = avl;
  size_t vl = vsetvl_e16m8(avl);

  // acc: element-wise partial products; buf_a/buf_b: current strip of a and b.
  vint16m8_t acc, buf_a, buf_b;
  // red: single-register operand that seeds and receives the reduction.
  vint16m1_t red;

  int16_t *a_ = (int16_t *)a;
  int16_t *b_ = (int16_t *)b;

  // Clean the accumulator
  // NOTE(review): red is passed as the destination operand before it has been
  // given a value.
  red = vmv_s_x_i16m1(red, 0, vl);
  // Stripmine and accumulate a partial reduced vector
  for (; avl > 0; avl -= vl) {
    // Ask for the remaining elements; vl is the count handled in this strip.
    vl = vsetvl_e16m8(avl);
    // Load chunk a and b
    buf_a = vle16_v_i16m8(a_, vl);
    buf_b = vle16_v_i16m8(b_, vl);
    // Multiply and accumulate
    // The first strip initialises acc with a plain multiply; every later
    // strip adds its products on top of the existing contents.
    if (avl == orig_avl) {
      acc = vmul_vv_i16m8(buf_a, buf_b, vl);
    } else {
      acc = vmacc_vv_i16m8(acc, buf_a, buf_b, vl);
    }
    // Bump pointers
    a_ += vl;
    b_ += vl;
  }

  // Reduce and return: sum the elements of acc on top of red, then move
  // element 0 of the result into a scalar.
  red = vredsum_vs_i16m8_i16m1(red, acc, red, vl);
  return vmv_x_s_i16m1_i16(red);

#else

  // Remember the full length so the first strip can be told apart from the
  // later ones inside the loop.
  size_t orig_avl = avl;
  size_t vl;
  // Configure 16-bit elements, LMUL=8, tail- and mask-agnostic; the granted
  // vector length is written to vl.
  asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));

  int16_t red;

  int16_t *a_ = (int16_t *)a;
  int16_t *b_ = (int16_t *)b;

  // NOTE(review): the asm statements below name no vector registers, no
  // vl/vtype dependency and no "memory" clobber; the code relies on the
  // compiler keeping these asm volatile statements in program order.

  // Clean the accumulator
  // (disabled here; the same instruction is issued after the loop instead)
//  asm volatile("vmv.s.x v0, zero");
  // Stripmine and accumulate a partial reduced vector
  for (; avl > 0; avl -= vl) {
    // Ask for the remaining elements; vl is the count handled in this strip.
    asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
    // Load chunk a and b
    // (a goes to the register group starting at v8, b to the one at v16)
    asm volatile("vle16.v v8,  (%0)" ::"r"(a_));
    asm volatile("vle16.v v16, (%0)" ::"r"(b_));
    // Multiply and accumulate
    // The first strip initialises v24 with a plain multiply; every later
    // strip adds its products on top of the existing contents of v24.
    if (avl == orig_avl) {
      asm volatile("vmul.vv v24, v8, v16");
    } else {
      asm volatile("vmacc.vv v24, v8, v16");
    }
    // Bump pointers
    a_ += vl;
    b_ += vl;
  }

  // Reduce and return
  // vmv.s.x writes x0 (zero) into element 0 of v0 to seed the reduction;
  // vredsum.vs adds the active elements of v24 to that seed and writes the
  // sum to element 0 of v0; vmv.x.s then copies that element into red.
  asm volatile("vmv.s.x v0, zero");
  asm volatile("vredsum.vs v0, v24, v0");
  asm volatile("vmv.x.s %0, v0" : "=r"(red));
  return red;

#endif
}

// Print the first N elements of numb in decimal, each followed by a space,
// and finish the line with a newline.
void printnumb(uint16_t *numb, uint32_t N) {
  for (uint32_t remaining = N; remaining > 0; remaining--) {
    printf("%d ", *numb);
    numb++;
  }
  printf("\n");
}

// Test driver: prints both input vectors, computes their dot product with
// dotp_v16b() and prints the result followed by "success!".
int main(void) {
  int16_t x[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  int16_t y[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  // Element count taken from the array itself instead of a repeated literal.
  const uint32_t len = (uint32_t)(sizeof(x) / sizeof(x[0]));

  // printnumb() takes uint16_t *. Passing int16_t * without a cast is an
  // incompatible-pointer conversion that recent compilers reject. All values
  // here are non-negative, so the printed output is unchanged.
  printf("X\n");
  printnumb((uint16_t *)x, len);
  printf("Y\n");
  printnumb((uint16_t *)y, len);

  int16_t zv = dotp_v16b(x, y, len);
  printf("\n");
  printf("ZV\n");
  printf("%d \n", zv);
  printf("success!\n");
  return 0;
}

mp-17 · 2024-08-12T12:44:35Z

Let me know if #342 fixes the issue!

mp-17 added the bug Something isn't working label Sep 6, 2023

mp-17 mentioned this issue Sep 6, 2023

Maybe the vmv. x.s instruction seems to have problems in certain specific situations #241

Closed

mp-17 closed this as completed Aug 12, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Issues related to the use of vmv and vredsum. #246

Issues related to the use of vmv and vredsum. #246

LeleJun97 commented Sep 4, 2023

LeleJun97 commented Sep 6, 2023

mp-17 commented Sep 6, 2023

mp-17 commented Sep 6, 2023 •

edited

Loading

mp-17 commented Aug 12, 2024 •

edited

Loading

Issues related to the use of vmv and vredsum. #246

Issues related to the use of vmv and vredsum. #246

Comments

LeleJun97 commented Sep 4, 2023

LeleJun97 commented Sep 6, 2023

mp-17 commented Sep 6, 2023

mp-17 commented Sep 6, 2023 • edited Loading

mp-17 commented Aug 12, 2024 • edited Loading

mp-17 commented Sep 6, 2023 •

edited

Loading

mp-17 commented Aug 12, 2024 •

edited

Loading