import sys

import torch

# NUMA node for the allocator library and number of timed iterations,
# taken from the command line.
NUMA = int(sys.argv[1])
NITER = int(sys.argv[2])

# Install a NUMA-aware CUDA allocator; this must run before any CUDA allocation.
new_alloc = torch.cuda.memory.CUDAPluggableAllocator(f'/users/lfusco/code/gh_benchmark/alloc{NUMA}.so', 'my_malloc', 'my_free')
torch.cuda.memory.change_current_allocator(new_alloc)
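# The shared library is assumed to export the two entry points named above with
# the C signatures PyTorch's pluggable-allocator API documents (a sketch; the
# actual alloc{NUMA}.so source is not part of this file):
#
#   void* my_malloc(ssize_t size, int device, cudaStream_t stream);
#   void  my_free(void* ptr, ssize_t size, int device, cudaStream_t stream);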
import time

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

access_token = ''  # Hugging Face access token (left blank here)
cache_dir = '/bret/scratch/cscs/lfusco/.cache/huggingface'
device = 'cuda:0'
# Candidate checkpoints; only the 33B model is used below.
model_13 = "meta-llama/Llama-2-13b-chat-hf"
model_33 = "bofenghuang/vigogne-33b-instruct"
# model_33 = "alexl83/LLaMA-33B-HF"

tokenizer = AutoTokenizer.from_pretrained(model_33, token=access_token, cache_dir=cache_dir, legacy=True)
model = AutoModelForCausalLM.from_pretrained(model_33, token=access_token, cache_dir=cache_dir)
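# Note: the checkpoint is loaded with its default dtype here (float32 unless
# the config says otherwise). If fp16 weights are intended, passing
# torch_dtype=torch.float16 to from_pretrained would make that explicit rather
# than relying on the pipeline's torch_dtype argument, whose effect on an
# already-instantiated model is version-dependent.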
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device=device,
)
# Time NITER generations of up to 100 tokens from an empty prompt.
for i in range(NITER):
    start = time.time()
    sequences = pipeline(
        '',
        do_sample=True,
        top_k=1,  # with top_k=1, sampling degenerates to greedy decoding
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=100,
        # temperature=0.0001
    )
    end = time.time()
    print(end - start)
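# Example invocation (hypothetical values): use the allocator built for NUMA
# node 0 and run 10 timed iterations, assuming alloc0.so exists at the path
# given above:
#
#   python run_llama13.py 0 10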