-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from paul-sheridan/v1-release
V1 release
- Loading branch information
Showing
20 changed files
with
758 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import pickle | ||
import random | ||
# Make sure processData.py is in the same directory or in the PYTHONPATH | ||
from processData import process_data | ||
|
||
# Paths to the decoded-generation pickle (input) and the processed pickle (output)
input_path = 'dataGenaration/result/result-gptNeo2.7b.pkl'
output_path = 'data/proccessData2.7.pkl'

# Load the original pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this on pickles produced by this project.
with open(input_path, 'rb') as file:
    original_data = pickle.load(file)
print(original_data)

# Shuffle in place (unseeded, so the order differs between runs).
random.shuffle(original_data)

# Process the original data using the imported function
processed_data = [process_data(item) for item in original_data]

# Save the processed data as a list with one processed entry per input item
with open(output_path, 'wb') as outfile:
    pickle.dump(processed_data, outfile)

# Report the path that was actually written; the previous message named a
# different, hard-coded file (proccessData125m.pkl).
print(f"Data processing complete and saved to {output_path}")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} | ||
{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} | ||
{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import pickle | ||
from transformers import GPT2Tokenizer | ||
|
||
def decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file,
                          file_prefix="gpt-neo-125m"):
    """Decode pickled token-ID shards to text and pickle the combined list.

    Shards are read from ``{input_folder}/{file_prefix}-{i}.pkl`` for every
    ``i`` from ``start_idx`` to ``end_idx`` inclusive. Each shard is expected
    to unpickle to an iterable of token-ID sequences.

    Args:
        model_name: Name or path passed to ``GPT2Tokenizer.from_pretrained``.
        start_idx: Index of the first shard to read.
        end_idx: Index of the last shard to read (inclusive).
        input_folder: Directory containing the shard pickles.
        output_file: Path of the pickle that receives the list of decoded strings.
        file_prefix: Shard file-name prefix. Defaults to ``"gpt-neo-125m"``,
            the name that was previously hard-coded, so existing callers are
            unaffected.

    A missing or unreadable shard is reported on stdout and skipped; the
    output file is written even when some shards failed.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    all_decoded_texts = []

    for i in range(start_idx, end_idx + 1):
        input_file = f"{input_folder}/{file_prefix}-{i}.pkl"
        print(f"Processing {input_file}...")

        try:
            # Load the generated text (as token IDs).
            # NOTE(review): pickle.load can execute code from the file; only
            # use on shards produced by this project.
            with open(input_file, 'rb') as f:
                generated_text_ids = pickle.load(f)

            # Decode the whole shard before extending, so a failure part-way
            # through never leaves a partially decoded shard in the result.
            decoded_texts = [
                tokenizer.decode(text_id, skip_special_tokens=True)
                for text_id in generated_text_ids
            ]
            all_decoded_texts.extend(decoded_texts)
        except FileNotFoundError:
            print(f"File not found: {input_file}")
        except Exception as e:
            # Deliberate best-effort: report the shard and continue with the rest.
            print(f"An error occurred while processing {input_file}: {str(e)}")

    # Save all decoded text to a file
    with open(output_file, 'wb') as f:
        pickle.dump(all_decoded_texts, f)

    print(f"Decoded texts saved to {output_file}")
|
||
if __name__ == '__main__':
    # Decode shards 2 through 20 of the GPT-Neo 125M run into one result pickle.
    decode_generated_text(
        model_name='EleutherAI/gpt-neo-125M',
        start_idx=2,
        end_idx=20,
        input_folder='dataGenaration/gpt-neo-125m',
        output_file='dataGenaration/result/result-gptNeo125m.pkl',
    )
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#!/bin/bash
# Generate one stand-alone GPT-Neo 125M generation script per prompt slice.
# Script i covers prompts [(i-1)*step, i*step) of data/promptSelection.pickle
# and writes its generated token IDs to gpt-neo-125m-i.pkl.

# Number of scripts you want to create
num_scripts=20
step=25000

for i in $(seq 1 "$num_scripts"); do
    start_point=$(( (i - 1) * step ))
    end_point=$(( i * step ))
    file_name="gpt-neo-125m-$i.py"
    output_pickle_file="gpt-neo-125m-$i.pkl"

    # Generate the Python script. The heredoc delimiter is intentionally
    # unquoted so the shell substitutes start_point, end_point and
    # output_pickle_file into the generated source.
    cat > "$file_name" <<EOF
import torch
import pickle
from transformers import GPTNeoForCausalLM, GPT2Tokenizer


class LLMsGeneration:
    """Batch text generation over one slice of the pickled prompt list."""

    def __init__(self, model, tokenizer, device, start_point, end_point):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.startPoint = start_point
        self.endPoint = end_point
        self.rawDoc = None

    def loadArray(self, prompt, batch_size=32):
        """Generate continuations for entries [startPoint:endPoint] of the pickle at prompt.

        Each entry is unpacked as (words, length): the words are joined with
        spaces to form the prompt text and length is the number of tokens to
        generate after it. The generated token-ID rows (prompt plus
        continuation) are stored as a list in self.rawDoc.
        """
        with open(prompt, 'rb') as f:
            data = pickle.load(f)[self.startPoint:self.endPoint]

        max_positions = self.model.config.max_position_embeddings
        outputs = []
        for offset in range(0, len(data), batch_size):
            words, lengths = zip(*data[offset:offset + batch_size])
            texts = [" ".join(w) for w in words]

            # Number of tokens to generate for this batch. It must leave at
            # least one position for the prompt inside the context window.
            new_tokens = max(1, int(max(lengths)))
            if new_tokens >= max_positions:
                print(f"Warning: Desired generation length ({new_tokens}) exceeds model's max position embeddings ({max_positions}). Truncating to {max_positions - 1}.")
                new_tokens = max_positions - 1

            # Pad only to the longest prompt in the batch and truncate so that
            # prompt + continuation fits in the context window. Padding every
            # prompt to the full window left no room to generate anything.
            encoded = self.tokenizer(texts,
                                     return_tensors="pt",
                                     truncation=True,
                                     padding=True,
                                     max_length=max_positions - new_tokens)
            input_ids = encoded.input_ids.to(self.device)
            attention_mask = encoded.attention_mask.to(self.device)

            generated = self.model.generate(input_ids,
                                            attention_mask=attention_mask,
                                            max_new_tokens=new_tokens,
                                            pad_token_id=self.tokenizer.pad_token_id)
            outputs.extend(generated.cpu().numpy())
        self.rawDoc = outputs

    def decode(self, data):
        return self.tokenizer.decode(data)


if __name__ == '__main__':
    # Set up start and end points, model, tokenizer, and device
    start_point = $start_point
    end_point = $end_point
    output_pickle_file = '$output_pickle_file'
    model_name = 'EleutherAI/gpt-neo-125m'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Initialize model and tokenizer
    model = GPTNeoForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # GPT-Neo has no pad token; reuse EOS. Pad on the left so that generation
    # continues directly from the last real prompt token of every row.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    # Initialize LLMsGeneration instance
    llms_generation = LLMsGeneration(model, tokenizer, device, start_point, end_point)
    # Load array and generate text
    llms_generation.loadArray('data/promptSelection.pickle')
    # Save the generated text
    with open(output_pickle_file, 'wb') as file:
        pickle.dump(llms_generation.rawDoc, file)
EOF

done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash
# Write slurm-job-1.sh through slurm-job-20.sh. Each file is a Slurm batch
# script that runs the matching gpt-neo-125m-<i>.py.

for ((i = 1; i <= 20; i++)); do
    output_file="slurm-job-${i}.sh"
    echo "Creating $output_file..."

    # Unquoted delimiter: ${i} in the body is expanded when the file is written.
    cat > "$output_file" <<EOT
#!/bin/bash
#SBATCH --account=your_account_name
#SBATCH --gpus-per-node=a100:1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=3
#SBATCH --mem=70G
#SBATCH --time=0-15:00
module load python/3.11.2
module load StdEnv/2020
python gpt-neo-125m-${i}.py
EOT
done
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash

# Submit the generated Slurm job scripts numbered 2 through 20.
# The range starts at 2, so slurm-job-1.sh is not submitted by this loop.
for ((i = 2; i <= 20; i++)); do
    slurm_file="slurm-job-${i}.sh"
    echo "Submitting $slurm_file..."
    sbatch "$slurm_file"
done

echo "All jobs submitted."
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import pickle | ||
from transformers import GPT2Tokenizer | ||
|
||
def decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file,
                          file_prefix="gpt-neo-125m"):
    """Decode pickled token-ID shards to text and pickle the combined list.

    Shards are read from ``{input_folder}/{file_prefix}-{i}.pkl`` for every
    ``i`` from ``start_idx`` to ``end_idx`` inclusive. Each shard is expected
    to unpickle to an iterable of token-ID sequences.

    Args:
        model_name: Name or path passed to ``GPT2Tokenizer.from_pretrained``.
        start_idx: Index of the first shard to read.
        end_idx: Index of the last shard to read (inclusive).
        input_folder: Directory containing the shard pickles.
        output_file: Path of the pickle that receives the list of decoded strings.
        file_prefix: Shard file-name prefix. Defaults to ``"gpt-neo-125m"``,
            the name that was previously hard-coded, so existing callers are
            unaffected.

    A missing or unreadable shard is reported on stdout and skipped; the
    output file is written even when some shards failed.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    all_decoded_texts = []

    for i in range(start_idx, end_idx + 1):
        input_file = f"{input_folder}/{file_prefix}-{i}.pkl"
        print(f"Processing {input_file}...")

        try:
            # Load the generated text (as token IDs).
            # NOTE(review): pickle.load can execute code from the file; only
            # use on shards produced by this project.
            with open(input_file, 'rb') as f:
                generated_text_ids = pickle.load(f)

            # Decode the whole shard before extending, so a failure part-way
            # through never leaves a partially decoded shard in the result.
            decoded_texts = [
                tokenizer.decode(text_id, skip_special_tokens=True)
                for text_id in generated_text_ids
            ]
            all_decoded_texts.extend(decoded_texts)
        except FileNotFoundError:
            print(f"File not found: {input_file}")
        except Exception as e:
            # Deliberate best-effort: report the shard and continue with the rest.
            print(f"An error occurred while processing {input_file}: {str(e)}")

    # Save all decoded text to a file
    with open(output_file, 'wb') as f:
        pickle.dump(all_decoded_texts, f)

    print(f"Decoded texts saved to {output_file}")
|
||
if __name__ == '__main__':
    # Decode shards 2 through 20 of the GPT-Neo 125M run into one result pickle.
    decode_generated_text(
        model_name='EleutherAI/gpt-neo-125M',
        start_idx=2,
        end_idx=20,
        input_folder='dataGenaration/gpt-neo-125m',
        output_file='dataGenaration/result/result-gptNeo125m.pkl',
    )
|
Oops, something went wrong.