Update llama.cpp binaries to 5f631c2 and align the LlamaContext #77

Merged 3 commits on Aug 5, 2023
2 changes: 2 additions & 0 deletions .gitignore
@@ -342,4 +342,6 @@ test/TensorFlowNET.Examples/mnist

# docs
site/

/LLama.Unittest/Models/*.bin

1 change: 1 addition & 0 deletions LLama.Unittest/BasicTest.cs
@@ -1,3 +1,4 @@
using LLama;
using LLama.Common;

namespace LLama.Unittest
1 change: 0 additions & 1 deletion LLama.Unittest/LLama.Unittest.csproj
@@ -41,5 +41,4 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>

</Project>
2 changes: 1 addition & 1 deletion LLama/Common/ModelParams.cs
@@ -84,7 +84,7 @@ public class ModelParams
/// <summary>
/// how split tensors should be distributed across GPUs
/// </summary>
public float[] TensorSplits { get; set; } = new float[] { 0 };
public nint TensorSplits { get; set; }

/// <summary>
///
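Since TensorSplits is now a raw native pointer rather than a managed array, a caller that keeps its per-GPU split fractions in a float[] has to marshal them itself and keep the memory alive while the native context uses it. A minimal sketch of one way to do that with a pinned GCHandle; the PinnedTensorSplit helper and the usage lines are illustrative assumptions, not part of this PR:

```csharp
using System;
using System.Runtime.InteropServices;

// Illustrative only: pin a managed float[] and hand its address to ModelParams.TensorSplits.
// The handle must stay allocated for the lifetime of the native context.
public sealed class PinnedTensorSplit : IDisposable
{
    private GCHandle _handle;

    public PinnedTensorSplit(float[] splits)
    {
        _handle = GCHandle.Alloc(splits, GCHandleType.Pinned);
        Pointer = _handle.AddrOfPinnedObject();
    }

    public nint Pointer { get; }

    public void Dispose()
    {
        if (_handle.IsAllocated)
            _handle.Free();
    }
}

// Usage sketch: a single-device split, mirroring the previous one-element default.
// using var split = new PinnedTensorSplit(new float[] { 1.0f });
// var modelParams = new ModelParams("path/to/model.bin") { TensorSplits = split.Pointer };
```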
4 changes: 2 additions & 2 deletions LLama/LLamaModel.cs
@@ -230,7 +230,7 @@ public void LoadState(State state)
/// <param name="tfsZ"></param>
/// <param name="typicalP"></param>
/// <returns></returns>
public llama_token Sample(LLamaTokenDataArray candidates, ref float mirostat_mu, float temperature = 0.8f, MiroStatType mirostat = MiroStatType.Disable,
public llama_token Sample(LLamaTokenDataArray candidates, ref float mirostat_mu, float temperature = 0.8f, MirostatType mirostat = MirostatType.Disable,
float mirostatTau = 5.0f, float mirostatEta = 0.1f, int topK = 40, float topP = 0.95f, float tfsZ = 1.0f, float typicalP = 1.0f)
{
llama_token id;
@@ -244,7 +244,7 @@ public llama_token Sample(LLamaTokenDataArray candidates, ref float mirostat_mu,
if (float.IsNaN(mirostat_mu))
mirostat_mu = 2 * mirostatTau;

if (mirostat == MiroStatType.MiroStat)
if (mirostat == MirostatType.Mirostat)
{
const int mirostat_m = 100;
SamplingApi.llama_sample_temperature(_ctx, candidates, temperature);
14 changes: 8 additions & 6 deletions LLama/Native/LLamaContextParams.cs
@@ -47,7 +47,8 @@ public struct LLamaContextParams
/// <summary>
/// how to split layers across multiple GPUs
/// </summary>
public float[] tensor_split;
public nint tensor_split;


/// <summary>
/// ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -78,6 +79,11 @@ public struct LLamaContextParams
[MarshalAs(UnmanagedType.I1)]
public bool low_vram;

/// <summary>
/// if true, use experimental mul_mat_q kernels
/// </summary>
[MarshalAs(UnmanagedType.I1)] public bool mul_mat_q;

/// <summary>
/// use fp16 for KV cache
/// </summary>
@@ -114,9 +120,5 @@ public struct LLamaContextParams
[MarshalAs(UnmanagedType.I1)]
public bool embedding;
}

public struct TensorSplits
{
public float Item1;
}
}

6 changes: 4 additions & 2 deletions LLama/Utils.cs
@@ -28,12 +28,14 @@
lparams.logits_all = @params.Perplexity;
lparams.embedding = @params.EmbeddingMode;
lparams.low_vram = @params.LowVram;


/*
if (@params.TensorSplits.Length != 1)
{
throw new ArgumentException("Currently multi-gpu support is not supported by " +
"both llama.cpp and LLamaSharp.");
}
}*/

lparams.tensor_split = @params.TensorSplits;

if (!File.Exists(@params.ModelPath))
@@ -89,15 +91,15 @@
#if NET6_0_OR_GREATER
if(encoding == Encoding.UTF8)
{
return Marshal.PtrToStringUTF8(ptr);

Check warning on line 94 in LLama/Utils.cs (GitHub Actions / Test: linux-debug, linux-release, windows-release): Possible null reference return.
}
else if(encoding == Encoding.Unicode)
{
return Marshal.PtrToStringUni(ptr);

Check warning on line 98 in LLama/Utils.cs (GitHub Actions / Test: linux-debug, linux-release, windows-release): Possible null reference return.
}
else
{
return Marshal.PtrToStringAuto(ptr);

Check warning on line 102 in LLama/Utils.cs (GitHub Actions / Test: linux-debug, linux-release, windows-release): Possible null reference return.
}
#else
byte* tp = (byte*)ptr.ToPointer();
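The CI annotations above flag that the Marshal.PtrToString* helpers can return null. A hedged sketch of how such warnings could be silenced, assuming it is acceptable to map a null result (or a zero pointer) to an empty string; this is only an illustration, not what the PR changes:

```csharp
using System;
using System.Runtime.InteropServices;
using System.Text;

internal static class NativeStringHelper
{
    // Sketch: guard the possible null return of the Marshal helpers so the
    // nullable-reference analyzer is satisfied on .NET 6 and later.
    public static string PtrToString(IntPtr ptr, Encoding encoding)
    {
        if (ptr == IntPtr.Zero)
            return string.Empty;

        string? result;
        if (encoding == Encoding.UTF8)
            result = Marshal.PtrToStringUTF8(ptr);
        else if (encoding == Encoding.Unicode)
            result = Marshal.PtrToStringUni(ptr);
        else
            result = Marshal.PtrToStringAuto(ptr);

        return result ?? string.Empty;
    }
}
```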
127 changes: 72 additions & 55 deletions LLama/runtimes/ggml-metal.metal
@@ -67,6 +67,17 @@ kernel void kernel_add(
dst[tpig] = src0[tpig] + src1[tpig];
}

// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_add_row(
device const float * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig % ne00];
}
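The new kernel_add_row broadcasts a single src1 row of length ne00 across every row of src0. A rough CPU-side equivalent in C#, just to illustrate the indexing (not part of the bindings):

```csharp
// CPU-side sketch of kernel_add_row: src1 holds one row of ne00 elements,
// and dst[i] = src0[i] + src1[i mod ne00] broadcasts that row over src0.
static float[] AddRow(float[] src0, float[] src1, int ne00)
{
    var dst = new float[src0.Length];
    for (int i = 0; i < src0.Length; i++)
        dst[i] = src0[i] + src1[i % ne00];
    return dst;
}
```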

kernel void kernel_mul(
device const float * src0,
device const float * src1,
@@ -376,87 +387,90 @@ kernel void kernel_rms_norm(
}
}

// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
// il indicates where the q4 quants begin (0 or QK4_0/4)
// we assume that the yl's have been multiplied with the appropriate scale factor
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
float d = qb_curr->d;
float4 acc = 0.f;
device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
for (int i = 0; i < 16; i+=2) {
acc[0] += yl[i] * (qs[i / 2] & 0x000F);
acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
float2 acc = 0.f;
device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
for (int i = 0; i < 8; i+=2) {
acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+ yl[i + 9] * (qs[i / 2] & 0xF000);
}
return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
return d * (sumy * -8.f + acc[0] + acc[1]);
}

// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
// il indicates where the q4 quants begin (0 or QK4_0/4)
// we assume that the yl's have been multiplied with the appropriate scale factor
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
float d = qb_curr->d;
float m = qb_curr->m;
float4 acc = 0.f;
device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
for (int i = 0; i < 16; i+=2) {
acc[0] += yl[i] * (qs[i / 2] & 0x000F);
acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
float2 acc = 0.f;
for (int i = 0; i < 8; i+=2) {
acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+ yl[i + 9] * (qs[i / 2] & 0xF000);
}
return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
return d * (acc[0] + acc[1]) + sumy * m;
}
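The rewritten block_q_n_dot_y processes half a q4_0 block per call and relies on the caller having pre-scaled the activations by 1, 1/256, 1/16 and 1/4096, so masking the packed 16-bit words replaces the bit shifts of the old version. A CPU-side C# sketch of the same arithmetic, with an assumed layout (a float scale followed by packed nibbles viewed as ushorts); for illustration only:

```csharp
// Sketch of the q4_0 half-block dot product. 'qs' is the block's packed quant data
// viewed as ushorts, 'ilHalf' selects which half of the block to read (0 or 4 in
// ushort units), 'yl' holds 16 activations pre-scaled by 1, 1/256, 1/16 and 1/4096,
// and 'sumy' is the sum of the corresponding unscaled activations.
static float BlockQ40DotHalf(float d, ushort[] qs, int ilHalf, float[] yl, float sumy)
{
    float acc0 = 0f, acc1 = 0f;
    for (int i = 0; i < 8; i += 2)
    {
        ushort q = qs[ilHalf + i / 2];
        acc0 += yl[i + 0] * (q & 0x000F)   // low nibble, low byte  (activation scale 1)
              + yl[i + 1] * (q & 0x0F00);  // low nibble, high byte (activation pre-divided by 256)
        acc1 += yl[i + 8] * (q & 0x00F0)   // high nibble, low byte  (pre-divided by 16)
              + yl[i + 9] * (q & 0xF000);  // high nibble, high byte (pre-divided by 4096)
    }
    // q4_0 quants carry an implicit -8 offset, hence the sumy * -8 correction.
    return d * (sumy * -8f + acc0 + acc1);
}
```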

// putting them in the kernel cause a significant performance penalty
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
template<typename block_q_type>
//Note: This is a template, but strictly speaking it only applies to
// quantizations where the block size is 32. It also does not
// giard against the number of rows not being divisible by
// N_DST, so this is another explicit assumption of the implementation.
template<typename block_q_type, int nr, int nsg, int nw>
void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
uint2 tgpig, uint tiisg, uint sgitg) {
const int nb = ne00/QK4_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
const int first_row = (r0 * nsg + sgitg) * nr;
device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
device const float * y = (device const float *) src1 + r1*ne10;
float4 y_curr[8]; // src1 vector cache
float sumf[N_DST]={0.f}, all_sum;
thread float * yl=(thread float *)y_curr;
float yl[16]; // src1 vector cache
float sumf[nr]={0.f};

// each thread in a SIMD group deals with 1 block.
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
}
const int ix = tiisg/2;
const int il = 8*(tiisg%2);

for (int row = 0; row < N_DST; row++) {
sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
}
}
device const float * yb = y + ix * QK4_0 + il;

// from now loads two rows every time and 16 blocks per row
int ir = tiisg / (N_SIMDWIDTH / 2);
int ib = tiisg % (N_SIMDWIDTH / 2);
for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
// each thread in a SIMD group deals with half a block.
for (int ib = ix; ib < nb; ib += nw/2) {
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
for (int i = 0; i < 8; i += 2) {
sumy += yb[i] + yb[i+1];
yl[i+0] = yb[i+ 0];
yl[i+1] = yb[i+ 1]/256.f;
sumy += yb[i+16] + yb[i+17];
yl[i+8] = yb[i+16]/16.f;
yl[i+9] = yb[i+17]/4096.f;
}

for (int row = 0; row < N_DST; row+=2) {
if (nb_start + ib < nb) {
sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
}
for (int row = 0; row < nr; row++) {
sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
}

yb += QK4_0 * 16;
}

for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
for (int row = 0; row < nr; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0 && first_row + row < ne01) {
dst[r1*ne0 + first_row + row] = tot;
}
}
}
@@ -472,7 +486,7 @@ kernel void kernel_mul_mat_q4_0_f32(
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
}

kernel void kernel_mul_mat_q4_1_f32(
@@ -486,7 +500,7 @@ kernel void kernel_mul_mat_q4_1_f32(
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
}

kernel void kernel_mul_mat_f16_f32(
@@ -495,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
Expand All @@ -515,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;

device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);

sum[tpitg.x] = 0.0f;
@@ -538,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
}
}


kernel void kernel_alibi_f32(
device const float * src0,
device float * dst,
Binary file modified LLama/runtimes/libllama.dylib
Binary file not shown.