From c0ba4f882f984f5c69e049af291cb44f234739d8 Mon Sep 17 00:00:00 2001 From: paul-sheridan Date: Tue, 24 Oct 2023 10:55:37 -0300 Subject: [PATCH 01/25] added title --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a9fb9e..4a99f5f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,3 @@ -# paper-heaps-law-llm +# Heaps' Law in GPT-Neo Large Language Model Emulated Corpora + Official repository for the workshop paper Heaps' Law in GPT-Neo Large Language Model Emulated Corpora From 5b16101043bd24b713b15b759d2b68bc0e9c11f4 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 3 Nov 2023 16:21:20 -0300 Subject: [PATCH 02/25] first commit commit the first version of the code --- gitHubOfficalCode/.idea/.gitignore | 3 + gitHubOfficalCode/.idea/gitHubOfficalCode.iml | 8 + .../inspectionProfiles/Project_Default.xml | 157 ++++++++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 + gitHubOfficalCode/.idea/misc.xml | 4 + gitHubOfficalCode/.idea/modules.xml | 8 + gitHubOfficalCode/DataSelection.py | 31 ++++ gitHubOfficalCode/DrawThePlotAndEstimation.py | 57 +++++++ gitHubOfficalCode/cleanData.py | 23 +++ gitHubOfficalCode/data/test.jsonl | 3 + .../dataGenaration/gpt-neo-1.3b/decode.py | 39 +++++ .../gpt-neo-1.3b/generate_python_scripts.sh | 105 ++++++++++++ .../gpt-neo-1.3b/generate_slurm_scripts.sh | 22 +++ .../gpt-neo-1.3b/submit_all_jobs.sh | 12 ++ .../dataGenaration/gpt-neo-125m/decode.py | 39 +++++ .../gpt-neo-125m/generate_python_scripts.sh | 105 ++++++++++++ .../gpt-neo-125m/generate_slurm_scripts.sh | 22 +++ .../gpt-neo-125m/submit_all_jobs.sh | 12 ++ .../dataGenaration/gpt-neo-2.7b/decode.py | 39 +++++ .../gpt-neo-2.7b/generate_python_scripts.sh | 105 ++++++++++++ .../gpt-neo-2.7b/generate_slurm_scripts.sh | 22 +++ .../gpt-neo-2.7b/submit_all_jobs.sh | 12 ++ gitHubOfficalCode/heaplaw.py | 39 +++++ gitHubOfficalCode/processData.py | 14 ++ gitHubOfficalCode/promtSelection.py | 48 ++++++ 25 files changed, 935 insertions(+) create mode 100644 gitHubOfficalCode/.idea/.gitignore create mode 100644 gitHubOfficalCode/.idea/gitHubOfficalCode.iml create mode 100644 gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml create mode 100644 gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml create mode 100644 gitHubOfficalCode/.idea/misc.xml create mode 100644 gitHubOfficalCode/.idea/modules.xml create mode 100644 gitHubOfficalCode/DataSelection.py create mode 100644 gitHubOfficalCode/DrawThePlotAndEstimation.py create mode 100644 gitHubOfficalCode/cleanData.py create mode 100644 gitHubOfficalCode/data/test.jsonl create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh create mode 100644 
gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh create mode 100644 gitHubOfficalCode/heaplaw.py create mode 100644 gitHubOfficalCode/processData.py create mode 100644 gitHubOfficalCode/promtSelection.py diff --git a/gitHubOfficalCode/.idea/.gitignore b/gitHubOfficalCode/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/gitHubOfficalCode/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/gitHubOfficalCode/.idea/gitHubOfficalCode.iml b/gitHubOfficalCode/.idea/gitHubOfficalCode.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/gitHubOfficalCode/.idea/gitHubOfficalCode.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml b/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..4ea2cf8 --- /dev/null +++ b/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,157 @@ + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml b/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/misc.xml b/gitHubOfficalCode/.idea/misc.xml new file mode 100644 index 0000000..a971a2c --- /dev/null +++ b/gitHubOfficalCode/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/modules.xml b/gitHubOfficalCode/.idea/modules.xml new file mode 100644 index 0000000..c7b47ab --- /dev/null +++ b/gitHubOfficalCode/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/DataSelection.py b/gitHubOfficalCode/DataSelection.py new file mode 100644 index 0000000..8152087 --- /dev/null +++ b/gitHubOfficalCode/DataSelection.py @@ -0,0 +1,31 @@ +import json +import pickle +from processData import process_data + +def main(): + file_path = 'data/test.jsonl' + processed_data = [] + limit = 500000 + prompt_length = 5 + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + data_chunk = json.loads(line).get('text', '') # Assuming each line is a JSON object with an 'abstract' field + processed_line = process_data(data_chunk) + if len(processed_line) > prompt_length: # Check if the processed line has more than 10 words + processed_data.append(processed_line) + if len(processed_data) >= limit: + break + with open('processData.pickle', 'wb') as f: + pickle.dump(processed_data, f) + print("Data has been processed and saved to 'processData.pickle'") + except FileNotFoundError: + print("The file was not found. Please check the file path.") + except json.JSONDecodeError: + print("Error decoding JSON. 
Please check the file content.") + except Exception as e: + print("An error occurred:", str(e)) + +if __name__ == "__main__": + main() + diff --git a/gitHubOfficalCode/DrawThePlotAndEstimation.py b/gitHubOfficalCode/DrawThePlotAndEstimation.py new file mode 100644 index 0000000..6e89451 --- /dev/null +++ b/gitHubOfficalCode/DrawThePlotAndEstimation.py @@ -0,0 +1,57 @@ +import math +import pickle +import numpy as np +from matplotlib import pyplot as plt + +class pubMed: + def __init__(self): + self.data = {} + + def loadDocument(self, filename, label): + with open(filename, 'rb') as f: + xy = pickle.load(f) + x, y = [], [] + for xAndy in xy: + x.append(xAndy[0]) + y.append(xAndy[1]) + # x.append(math.log10(xAndy[0])) + # y.append(math.log10(xAndy[1])) + self.data[label] = (x, y) + + def DrawHeapLaw(self, step): + colors = ['r', 'g', 'b'] + plt.ylabel('Vocabulary Size') + plt.xlabel('Collection Size') + plt.title("Heaps' law") + + for idx, (label, (x, y)) in enumerate(self.data.items()): + step -= 1 + i = 0 + print(x) + filtered_x, filtered_y = [], [] + while i < len(x): + + filtered_x.append(x[i]) + filtered_y.append(y[i]) + i += step + plt.plot(filtered_x, filtered_y, color=colors[idx], label=label) + beta , logk = np.polyfit(filtered_x, filtered_y, 1) + print(f"Slope for {label}: {beta}") + print(f"Slope for {label}: {10**logk}") + + + + + plt.legend() + plt.grid(True) + plt.savefig('-loglog.pdf', transparent=True) + plt.show() + + +onehu = pubMed() +files = [ 'heapLawData-selectedPromt.pkl' , "heaplaw125m.pkl","heaplaw1.3b.pkl"] +labels = ["PubMed","125m","1.3b" ] +for file, label in zip(files, labels): + onehu.loadDocument(file, label) + +onehu.DrawHeapLaw(10) diff --git a/gitHubOfficalCode/cleanData.py b/gitHubOfficalCode/cleanData.py new file mode 100644 index 0000000..9dd7e60 --- /dev/null +++ b/gitHubOfficalCode/cleanData.py @@ -0,0 +1,23 @@ +import pickle +import random +# Make sure processData.py is in the same directory or in the PYTHONPATH +from processData import process_data + +# Path to the original and new pickle files +input_path = 'dataGenaration/result/result-gptNeo2.7b.pkl' +output_path = 'data/proccessData2.7.pkl' + +# Load the original pickle file +with open(input_path, 'rb') as file: + original_data = pickle.load(file) +print(original_data) +random.shuffle(original_data) +# Process the original data using the imported function +processed_data = [process_data(item) for item in original_data] + +# Now, we will save the processed data as an array of arrays +with open(output_path, 'wb') as outfile: + pickle.dump(processed_data, outfile) + +print("Data processing complete and saved to proccessData125m.pkl") + diff --git a/gitHubOfficalCode/data/test.jsonl b/gitHubOfficalCode/data/test.jsonl new file mode 100644 index 0000000..94b381f --- /dev/null +++ b/gitHubOfficalCode/data/test.jsonl @@ -0,0 +1,3 @@ +{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} +{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} +{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py new file mode 100644 index 0000000..b557b0c --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py @@ -0,0 +1,39 @@ +import pickle +from transformers import 
GPT2Tokenizer + +def decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file): + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + all_decoded_texts = [] + + for i in range(start_idx, end_idx + 1): + input_file = f"{input_folder}/gpt-neo-125m-{i}.pkl" + print(f"Processing {input_file}...") + + # Load the generated text (as token IDs) + try: + with open(input_file, 'rb') as f: + generated_text_ids = pickle.load(f) + + # Decode the text + decoded_texts = [tokenizer.decode(text_id, skip_special_tokens=True) for text_id in generated_text_ids] + all_decoded_texts.extend(decoded_texts) + except FileNotFoundError: + print(f"File not found: {input_file}") + except Exception as e: + print(f"An error occurred while processing {input_file}: {str(e)}") + + # Save all decoded text to a file + with open(output_file, 'wb') as f: + pickle.dump(all_decoded_texts, f) + + print(f"Decoded texts saved to {output_file}") + +if __name__ == '__main__': + model_name = 'EleutherAI/gpt-neo-125M' + start_idx = 2 + end_idx = 20 + input_folder = 'dataGenaration/gpt-neo-125m' + output_file = 'dataGenaration/result/result-gptNeo125m.pkl' + decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file) + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh new file mode 100644 index 0000000..af0d50e --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh @@ -0,0 +1,105 @@ + +# Number of scripts you want to create +num_scripts=20 +step=25000 + +for i in $(seq 1 $num_scripts); do + start_point=$(( ($i-1) * $step )) + end_point=$(( $i * $step )) + file_name="gpt-neo-125m-$i.py" + output_pickle_file="gpt-neo-125m-$i.pkl" + + # Generate the Python script + cat > $file_name < max_position_embeddings: + raise ValueError(f"Input IDs length ({batch_input_ids.shape[1]}) exceeds model's max position embeddings ({max_position_embeddings}).") + + max_desired_length = max(batch_lengths).item() + total_length = batch_input_ids.shape[1] + max_desired_length + + if total_length > max_position_embeddings: + print(f"Warning: Desired total length ({total_length}) exceeds model's max position embeddings ({max_position_embeddings}). 
Truncating to {max_position_embeddings}.") + total_length = max_position_embeddings + + generated = self.model.generate(batch_input_ids, + max_length=total_length, + pad_token_id=self.tokenizer.pad_token_id) + + outputs.extend(generated.cpu().numpy()) + + self.rawDoc = outputs + + def decode(self, data): + return self.tokenizer.decode(data) + + +if __name__ == '__main__': + # Set up start and end points, model, tokenizer, and device + start_point = $start_point + end_point = $end_point + output_pickle_file = '$output_pickle_file' + model_name = 'EleutherAI/gpt-neo-125m' + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Initialize model and tokenizer + model = GPTNeoForCausalLM.from_pretrained(model_name).to(device) + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + # Set padding token + tokenizer.pad_token = tokenizer.eos_token + + # Initialize LLMsGeneration instance + llms_generation = LLMsGeneration(model, tokenizer, device, start_point, end_point) + + # Load array and generate text + llms_generation.loadArray('data/promptSelection.pickle') + + # Save the generated text + with open(output_pickle_file, 'wb') as file: + pickle.dump(llms_generation.rawDoc, file) + +EOF + +done diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh new file mode 100644 index 0000000..fa214e9 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +for i in {1..20} +do + output_file="slurm-job-${i}.sh" + echo "Creating $output_file..." + + cat < $output_file +#!/bin/bash +#SBATCH --account=your_account_name +#SBATCH --gpus-per-node=a100:1 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=3 +#SBATCH --mem=70G +#SBATCH --time=0-15:00 +module load python/3.11.2 +module load StdEnv/2020 +python gpt-neo-125m-${i}.py + +EOT +done + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh new file mode 100644 index 0000000..3f08cd4 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# Loop through the generated slurm job scripts and submit them +for i in {2..20} +do + slurm_file="slurm-job-${i}.sh" + echo "Submitting $slurm_file..." + sbatch $slurm_file +done + +echo "All jobs submitted." 
+ diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py new file mode 100644 index 0000000..b557b0c --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py @@ -0,0 +1,39 @@ +import pickle +from transformers import GPT2Tokenizer + +def decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file): + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + all_decoded_texts = [] + + for i in range(start_idx, end_idx + 1): + input_file = f"{input_folder}/gpt-neo-125m-{i}.pkl" + print(f"Processing {input_file}...") + + # Load the generated text (as token IDs) + try: + with open(input_file, 'rb') as f: + generated_text_ids = pickle.load(f) + + # Decode the text + decoded_texts = [tokenizer.decode(text_id, skip_special_tokens=True) for text_id in generated_text_ids] + all_decoded_texts.extend(decoded_texts) + except FileNotFoundError: + print(f"File not found: {input_file}") + except Exception as e: + print(f"An error occurred while processing {input_file}: {str(e)}") + + # Save all decoded text to a file + with open(output_file, 'wb') as f: + pickle.dump(all_decoded_texts, f) + + print(f"Decoded texts saved to {output_file}") + +if __name__ == '__main__': + model_name = 'EleutherAI/gpt-neo-125M' + start_idx = 2 + end_idx = 20 + input_folder = 'dataGenaration/gpt-neo-125m' + output_file = 'dataGenaration/result/result-gptNeo125m.pkl' + decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file) + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh new file mode 100644 index 0000000..af0d50e --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh @@ -0,0 +1,105 @@ + +# Number of scripts you want to create +num_scripts=20 +step=25000 + +for i in $(seq 1 $num_scripts); do + start_point=$(( ($i-1) * $step )) + end_point=$(( $i * $step )) + file_name="gpt-neo-125m-$i.py" + output_pickle_file="gpt-neo-125m-$i.pkl" + + # Generate the Python script + cat > $file_name < max_position_embeddings: + raise ValueError(f"Input IDs length ({batch_input_ids.shape[1]}) exceeds model's max position embeddings ({max_position_embeddings}).") + + max_desired_length = max(batch_lengths).item() + total_length = batch_input_ids.shape[1] + max_desired_length + + if total_length > max_position_embeddings: + print(f"Warning: Desired total length ({total_length}) exceeds model's max position embeddings ({max_position_embeddings}). 
Truncating to {max_position_embeddings}.") + total_length = max_position_embeddings + + generated = self.model.generate(batch_input_ids, + max_length=total_length, + pad_token_id=self.tokenizer.pad_token_id) + + outputs.extend(generated.cpu().numpy()) + + self.rawDoc = outputs + + def decode(self, data): + return self.tokenizer.decode(data) + + +if __name__ == '__main__': + # Set up start and end points, model, tokenizer, and device + start_point = $start_point + end_point = $end_point + output_pickle_file = '$output_pickle_file' + model_name = 'EleutherAI/gpt-neo-125m' + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Initialize model and tokenizer + model = GPTNeoForCausalLM.from_pretrained(model_name).to(device) + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + # Set padding token + tokenizer.pad_token = tokenizer.eos_token + + # Initialize LLMsGeneration instance + llms_generation = LLMsGeneration(model, tokenizer, device, start_point, end_point) + + # Load array and generate text + llms_generation.loadArray('data/promptSelection.pickle') + + # Save the generated text + with open(output_pickle_file, 'wb') as file: + pickle.dump(llms_generation.rawDoc, file) + +EOF + +done diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh new file mode 100644 index 0000000..fa214e9 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +for i in {1..20} +do + output_file="slurm-job-${i}.sh" + echo "Creating $output_file..." + + cat < $output_file +#!/bin/bash +#SBATCH --account=your_account_name +#SBATCH --gpus-per-node=a100:1 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=3 +#SBATCH --mem=70G +#SBATCH --time=0-15:00 +module load python/3.11.2 +module load StdEnv/2020 +python gpt-neo-125m-${i}.py + +EOT +done + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh new file mode 100644 index 0000000..3f08cd4 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# Loop through the generated slurm job scripts and submit them +for i in {2..20} +do + slurm_file="slurm-job-${i}.sh" + echo "Submitting $slurm_file..." + sbatch $slurm_file +done + +echo "All jobs submitted." 
+ diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py new file mode 100644 index 0000000..b557b0c --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py @@ -0,0 +1,39 @@ +import pickle +from transformers import GPT2Tokenizer + +def decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file): + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + all_decoded_texts = [] + + for i in range(start_idx, end_idx + 1): + input_file = f"{input_folder}/gpt-neo-125m-{i}.pkl" + print(f"Processing {input_file}...") + + # Load the generated text (as token IDs) + try: + with open(input_file, 'rb') as f: + generated_text_ids = pickle.load(f) + + # Decode the text + decoded_texts = [tokenizer.decode(text_id, skip_special_tokens=True) for text_id in generated_text_ids] + all_decoded_texts.extend(decoded_texts) + except FileNotFoundError: + print(f"File not found: {input_file}") + except Exception as e: + print(f"An error occurred while processing {input_file}: {str(e)}") + + # Save all decoded text to a file + with open(output_file, 'wb') as f: + pickle.dump(all_decoded_texts, f) + + print(f"Decoded texts saved to {output_file}") + +if __name__ == '__main__': + model_name = 'EleutherAI/gpt-neo-125M' + start_idx = 2 + end_idx = 20 + input_folder = 'dataGenaration/gpt-neo-125m' + output_file = 'dataGenaration/result/result-gptNeo125m.pkl' + decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file) + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh new file mode 100644 index 0000000..af0d50e --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh @@ -0,0 +1,105 @@ + +# Number of scripts you want to create +num_scripts=20 +step=25000 + +for i in $(seq 1 $num_scripts); do + start_point=$(( ($i-1) * $step )) + end_point=$(( $i * $step )) + file_name="gpt-neo-125m-$i.py" + output_pickle_file="gpt-neo-125m-$i.pkl" + + # Generate the Python script + cat > $file_name < max_position_embeddings: + raise ValueError(f"Input IDs length ({batch_input_ids.shape[1]}) exceeds model's max position embeddings ({max_position_embeddings}).") + + max_desired_length = max(batch_lengths).item() + total_length = batch_input_ids.shape[1] + max_desired_length + + if total_length > max_position_embeddings: + print(f"Warning: Desired total length ({total_length}) exceeds model's max position embeddings ({max_position_embeddings}). 
Truncating to {max_position_embeddings}.") + total_length = max_position_embeddings + + generated = self.model.generate(batch_input_ids, + max_length=total_length, + pad_token_id=self.tokenizer.pad_token_id) + + outputs.extend(generated.cpu().numpy()) + + self.rawDoc = outputs + + def decode(self, data): + return self.tokenizer.decode(data) + + +if __name__ == '__main__': + # Set up start and end points, model, tokenizer, and device + start_point = $start_point + end_point = $end_point + output_pickle_file = '$output_pickle_file' + model_name = 'EleutherAI/gpt-neo-125m' + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Initialize model and tokenizer + model = GPTNeoForCausalLM.from_pretrained(model_name).to(device) + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + # Set padding token + tokenizer.pad_token = tokenizer.eos_token + + # Initialize LLMsGeneration instance + llms_generation = LLMsGeneration(model, tokenizer, device, start_point, end_point) + + # Load array and generate text + llms_generation.loadArray('data/promptSelection.pickle') + + # Save the generated text + with open(output_pickle_file, 'wb') as file: + pickle.dump(llms_generation.rawDoc, file) + +EOF + +done diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh new file mode 100644 index 0000000..fa214e9 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +for i in {1..20} +do + output_file="slurm-job-${i}.sh" + echo "Creating $output_file..." + + cat < $output_file +#!/bin/bash +#SBATCH --account=your_account_name +#SBATCH --gpus-per-node=a100:1 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=3 +#SBATCH --mem=70G +#SBATCH --time=0-15:00 +module load python/3.11.2 +module load StdEnv/2020 +python gpt-neo-125m-${i}.py + +EOT +done + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh new file mode 100644 index 0000000..3f08cd4 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# Loop through the generated slurm job scripts and submit them +for i in {2..20} +do + slurm_file="slurm-job-${i}.sh" + echo "Submitting $slurm_file..." + sbatch $slurm_file +done + +echo "All jobs submitted." 
+ diff --git a/gitHubOfficalCode/heaplaw.py b/gitHubOfficalCode/heaplaw.py new file mode 100644 index 0000000..ed43c50 --- /dev/null +++ b/gitHubOfficalCode/heaplaw.py @@ -0,0 +1,39 @@ +import pickle +import random + +def process_data(input_file): + # Initialize counters + overall_total_words = 0 + overall_unique_words = set() + + with open(input_file, 'rb') as f: + data = pickle.load(f) + + # Shuffle the main data array + random.shuffle(data) + + # Initialize the result array + result = [] + + for word_array in data: + # Update the overall counters + overall_total_words += len(word_array) + overall_unique_words.update(word_array) + + # Append the current count and unique count to result + result.append([overall_total_words, len(overall_unique_words)]) + + return result + +if __name__ == '__main__': + input_file = 'data/proccessData1.3.pkl' + output_file = 'dataGenaration/result/heaplaw1.3b.pkl' + + result = process_data(input_file) + + # Save the result array to a file + with open(output_file, 'wb') as f: + pickle.dump(result, f) + + print(f"Result saved to {output_file}") + diff --git a/gitHubOfficalCode/processData.py b/gitHubOfficalCode/processData.py new file mode 100644 index 0000000..361cfd3 --- /dev/null +++ b/gitHubOfficalCode/processData.py @@ -0,0 +1,14 @@ +import re +import unicodedata + +def process_data(data): + # Normalize data + normalized_data = unicodedata.normalize('NFKD', data).encode('ascii', 'ignore').decode('utf-8', 'ignore') + # Convert to lowercase + normalized_data = normalized_data.lower() + # Remove punctuation + normalized_data = re.sub(r'[^\w\s]', '', normalized_data).strip() + # Tokenize + tokens = normalized_data.split() + return tokens + diff --git a/gitHubOfficalCode/promtSelection.py b/gitHubOfficalCode/promtSelection.py new file mode 100644 index 0000000..3623d54 --- /dev/null +++ b/gitHubOfficalCode/promtSelection.py @@ -0,0 +1,48 @@ +from transformers import GPT2Tokenizer +import pickle + +def create_prompts(data, prompt_length=5, tokenizer=None): + prompts = [] + for item in data: + if tokenizer is None: + raise ValueError("Tokenizer must be provided") + + tokenized_item = tokenizer(item, add_special_tokens=False) + tokenized_length = len(tokenized_item['input_ids']) + + if tokenized_length >= prompt_length: + prompt = item[:prompt_length] + else: + prompt = item + + prompts.append((prompt, tokenized_length)) + return prompts + +def main(): + data_folder = '/data' + processData_file = f'{data_folder}/processData.pickle' + promptSelection_file = f'{data_folder}/promptSelection.pickle' + model_name = 'EleutherAI/gpt-neo-125m' + + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + + # Step 1: Read Data + with open(processData_file, 'rb') as f: + processed_data = pickle.load(f) + + # Select only the first 2 items for quick testing + + # Step 2: Create Prompts and Store Tokenized Length + prompts = create_prompts(processed_data, 10, tokenizer) + + # Step 3: Save Prompt Data to File + with open(promptSelection_file, 'wb') as f: + pickle.dump(prompts, f) + + print(f"Prompt data has been saved to '{promptSelection_file}'") + + +if __name__ == "__main__": + main() + From 7cf39574f00efb9b11e67bd48ad697ba4a82e40b Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 3 Nov 2023 16:22:46 -0300 Subject: [PATCH 03/25] delete unrealatething --- .../DataSelection.py => DataSelection.py | 0 ...timation.py => DrawThePlotAndEstimation.py | 0 .../cleanData.py => cleanData.py | 0 
{gitHubOfficalCode/data => data}/test.jsonl | 0 .../gpt-neo-1.3b/decode.py | 0 .../gpt-neo-1.3b/generate_python_scripts.sh | 0 .../gpt-neo-1.3b/generate_slurm_scripts.sh | 0 .../gpt-neo-1.3b/submit_all_jobs.sh | 0 .../gpt-neo-125m/decode.py | 0 .../gpt-neo-125m/generate_python_scripts.sh | 0 .../gpt-neo-125m/generate_slurm_scripts.sh | 0 .../gpt-neo-125m/submit_all_jobs.sh | 0 .../gpt-neo-2.7b/decode.py | 0 .../gpt-neo-2.7b/generate_python_scripts.sh | 0 .../gpt-neo-2.7b/generate_slurm_scripts.sh | 0 .../gpt-neo-2.7b/submit_all_jobs.sh | 0 gitHubOfficalCode/.idea/.gitignore | 3 - gitHubOfficalCode/.idea/gitHubOfficalCode.iml | 8 - .../inspectionProfiles/Project_Default.xml | 157 ------------------ .../inspectionProfiles/profiles_settings.xml | 6 - gitHubOfficalCode/.idea/misc.xml | 4 - gitHubOfficalCode/.idea/modules.xml | 8 - gitHubOfficalCode/heaplaw.py => heaplaw.py | 0 .../processData.py => processData.py | 0 .../promtSelection.py => promtSelection.py | 0 25 files changed, 186 deletions(-) rename gitHubOfficalCode/DataSelection.py => DataSelection.py (100%) rename gitHubOfficalCode/DrawThePlotAndEstimation.py => DrawThePlotAndEstimation.py (100%) rename gitHubOfficalCode/cleanData.py => cleanData.py (100%) rename {gitHubOfficalCode/data => data}/test.jsonl (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-1.3b/decode.py (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-1.3b/generate_python_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-1.3b/generate_slurm_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-1.3b/submit_all_jobs.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-125m/decode.py (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-125m/generate_python_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-125m/generate_slurm_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-125m/submit_all_jobs.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-2.7b/decode.py (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-2.7b/generate_python_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-2.7b/generate_slurm_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-2.7b/submit_all_jobs.sh (100%) delete mode 100644 gitHubOfficalCode/.idea/.gitignore delete mode 100644 gitHubOfficalCode/.idea/gitHubOfficalCode.iml delete mode 100644 gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml delete mode 100644 gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml delete mode 100644 gitHubOfficalCode/.idea/misc.xml delete mode 100644 gitHubOfficalCode/.idea/modules.xml rename gitHubOfficalCode/heaplaw.py => heaplaw.py (100%) rename gitHubOfficalCode/processData.py => processData.py (100%) rename gitHubOfficalCode/promtSelection.py => promtSelection.py (100%) diff --git a/gitHubOfficalCode/DataSelection.py b/DataSelection.py similarity index 100% rename from gitHubOfficalCode/DataSelection.py rename to DataSelection.py diff --git a/gitHubOfficalCode/DrawThePlotAndEstimation.py b/DrawThePlotAndEstimation.py similarity index 100% rename from gitHubOfficalCode/DrawThePlotAndEstimation.py rename to DrawThePlotAndEstimation.py diff --git a/gitHubOfficalCode/cleanData.py b/cleanData.py similarity index 100% rename from 
gitHubOfficalCode/cleanData.py rename to cleanData.py diff --git a/gitHubOfficalCode/data/test.jsonl b/data/test.jsonl similarity index 100% rename from gitHubOfficalCode/data/test.jsonl rename to data/test.jsonl diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py b/dataGenaration/gpt-neo-1.3b/decode.py similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py rename to dataGenaration/gpt-neo-1.3b/decode.py diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh b/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh rename to dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh b/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh rename to dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh b/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh rename to dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py b/dataGenaration/gpt-neo-125m/decode.py similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py rename to dataGenaration/gpt-neo-125m/decode.py diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh b/dataGenaration/gpt-neo-125m/generate_python_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh rename to dataGenaration/gpt-neo-125m/generate_python_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh b/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh rename to dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh b/dataGenaration/gpt-neo-125m/submit_all_jobs.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh rename to dataGenaration/gpt-neo-125m/submit_all_jobs.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py b/dataGenaration/gpt-neo-2.7b/decode.py similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py rename to dataGenaration/gpt-neo-2.7b/decode.py diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh b/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh rename to dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh b/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh rename to dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh b/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh similarity index 100% rename 
from gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh rename to dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh diff --git a/gitHubOfficalCode/.idea/.gitignore b/gitHubOfficalCode/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/gitHubOfficalCode/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/gitHubOfficalCode/.idea/gitHubOfficalCode.iml b/gitHubOfficalCode/.idea/gitHubOfficalCode.iml deleted file mode 100644 index d0876a7..0000000 --- a/gitHubOfficalCode/.idea/gitHubOfficalCode.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml b/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 4ea2cf8..0000000 --- a/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,157 +0,0 @@ - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml b/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/misc.xml b/gitHubOfficalCode/.idea/misc.xml deleted file mode 100644 index a971a2c..0000000 --- a/gitHubOfficalCode/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/modules.xml b/gitHubOfficalCode/.idea/modules.xml deleted file mode 100644 index c7b47ab..0000000 --- a/gitHubOfficalCode/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/heaplaw.py b/heaplaw.py similarity index 100% rename from gitHubOfficalCode/heaplaw.py rename to heaplaw.py diff --git a/gitHubOfficalCode/processData.py b/processData.py similarity index 100% rename from gitHubOfficalCode/processData.py rename to processData.py diff --git a/gitHubOfficalCode/promtSelection.py b/promtSelection.py similarity index 100% rename from gitHubOfficalCode/promtSelection.py rename to promtSelection.py From 5185fc42bbaacfa6a1cec516e60f0886f6a059f2 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Mon, 20 Nov 2023 14:54:23 -0400 Subject: [PATCH 04/25] update the table --- DrawThePlotAndEstimation.py | 133 +++++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 55 deletions(-) diff --git a/DrawThePlotAndEstimation.py b/DrawThePlotAndEstimation.py index 6e89451..045f543 100644 --- a/DrawThePlotAndEstimation.py +++ b/DrawThePlotAndEstimation.py @@ -1,57 +1,80 @@ -import math import pickle + +import pandas as pd import numpy as np -from matplotlib import pyplot as plt - -class pubMed: - def __init__(self): - self.data = {} - - def loadDocument(self, filename, label): - with open(filename, 'rb') as f: - xy = pickle.load(f) - x, y = [], [] - for xAndy in xy: - x.append(xAndy[0]) - y.append(xAndy[1]) - # x.append(math.log10(xAndy[0])) - # y.append(math.log10(xAndy[1])) - self.data[label] = (x, y) - - def DrawHeapLaw(self, step): - colors = ['r', 'g', 'b'] - plt.ylabel('Vocabulary Size') - plt.xlabel('Collection Size') - plt.title("Heaps' law") - - for idx, (label, (x, y)) in enumerate(self.data.items()): - step -= 1 - i = 0 - print(x) - filtered_x, filtered_y = [], [] - while i < len(x): - - filtered_x.append(x[i]) - 
filtered_y.append(y[i]) - i += step - plt.plot(filtered_x, filtered_y, color=colors[idx], label=label) - beta , logk = np.polyfit(filtered_x, filtered_y, 1) - print(f"Slope for {label}: {beta}") - print(f"Slope for {label}: {10**logk}") - - - - - plt.legend() - plt.grid(True) - plt.savefig('-loglog.pdf', transparent=True) - plt.show() - - -onehu = pubMed() -files = [ 'heapLawData-selectedPromt.pkl' , "heaplaw125m.pkl","heaplaw1.3b.pkl"] -labels = ["PubMed","125m","1.3b" ] -for file, label in zip(files, labels): - onehu.loadDocument(file, label) - -onehu.DrawHeapLaw(10) +import matplotlib.pyplot as plt +import seaborn as sns +import statsmodels.api as sm + + +def analyze_corpus(file_path, corpus_name): + # Read data + + + with open(file_path, 'rb') as f: + data = pickle.load(f) + data = data.drop(0) # Drop the first row + + # Fit model + X = sm.add_constant(np.log10(data['n'])) + model = sm.OLS(np.log10(data['m']), X).fit() + + # Summary statistics + alpha_hat = 10 ** model.params[0] + beta_hat = model.params[1] + rsq = model.rsquared + print(f"{corpus_name} - alpha (est): {alpha_hat:.4f}, beta (est): {beta_hat:.4f}, R squared: {rsq:.4f}") + + # Confidence intervals + conf_int = model.conf_int(alpha=0.1) # 90% CI + alpha_ci_low, alpha_ci_high = 10 ** conf_int.iloc[0, 0], 10 ** conf_int.iloc[0, 1] + beta_ci_low, beta_ci_high = conf_int.iloc[1, 0], conf_int.iloc[1, 1] + print(f"{corpus_name} - 90% CI for alpha: [{alpha_ci_low:.4f}, {alpha_ci_high:.4f}]") + print(f"{corpus_name} - 90% CI for beta: [{beta_ci_low:.4f}, {beta_ci_high:.4f}]") + + return data, model.params + + + + +pubmed_data, pubmed_params = analyze_corpus('data/pubmed.pkl', 'PubMed') +gptneo125m_data, gptneo125m_params = analyze_corpus('data/heapLawData-gptneo125m.pkl', 'GPT-Neo-125m') +gptneo13b_data, gptneo13b_params = analyze_corpus('data/heapLawData-gptneo1.3B.pkl', 'GPT-Neo-1.3B') +gptneo27b_data, gptneo27b_params = analyze_corpus('data/heapLawData-gptneo2.7B.pkl', 'GPT-Neo-2.7B') + + + +def plot_corpus(data, params, corpus_name, ax, log_scale=False): + if log_scale: + sns.lineplot(x=np.log10(data['n']), y=np.log10(data['m']), ax=ax, label=f"{corpus_name}: β={params[1]:.4f}") + else: + sns.lineplot(x=data['n'], y=data['m'], ax=ax, label=f"{corpus_name}: β={params[1]:.4f}") + +# Natural Scale Plot +plt.figure(figsize=(12, 8)) +ax1 = plt.subplot(1, 2, 1) +plot_corpus(pubmed_data, pubmed_params, 'PubMed', ax1) +plot_corpus(gptneo125m_data, gptneo125m_params, 'GPT-Neo-125m', ax1) +plot_corpus(gptneo13b_data, gptneo13b_params, 'GPT-Neo-1.3B', ax1) +plot_corpus(gptneo27b_data, gptneo27b_params, 'GPT-Neo-2.7B', ax1) +plt.title("Heaps' Law - Natural Scale") +plt.xlabel('Total Words') +plt.ylabel('Vocabulary Size') +plt.legend() + +# Log-Log Scale Plot +ax2 = plt.subplot(1, 2, 2) +plot_corpus(pubmed_data, pubmed_params, 'PubMed', ax2, log_scale=True) +plot_corpus(gptneo125m_data, gptneo125m_params, 'GPT-Neo-125m', ax2, log_scale=True) +plot_corpus(gptneo13b_data, gptneo13b_params, 'GPT-Neo-1.3B', ax2, log_scale=True) +plot_corpus(gptneo27b_data, gptneo27b_params, 'GPT-Neo-2.7B', ax2, log_scale=True) +plt.title("Heaps' Law - Log-Log Scale") +plt.xlabel('Log Total Words') +plt.ylabel('Log Vocabulary Size') +plt.legend() + +plt.tight_layout() +# plt.show() + + + From eeabf42dba5d12083a5ea87907957e2bd1036dc9 Mon Sep 17 00:00:00 2001 From: uyen lai <53119641+rachelxx03@users.noreply.github.com> Date: Thu, 23 Nov 2023 06:11:58 -0400 Subject: [PATCH 05/25] Update README.md --- README.md | 93 
++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a99f5f..2e3aaa2 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,94 @@ # Heaps' Law in GPT-Neo Large Language Model Emulated Corpora - Official repository for the workshop paper Heaps' Law in GPT-Neo Large Language Model Emulated Corpora +ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 + +## Getting Started + +Clone this repository by running the command +``` +git clone https://github.com/paul-sheridan/paper-heaps-law-llm.git +``` +and `cd` into the repository root folder `paper-heaps-law-llm`. + +## Data + + +We download the data from The Pile, it is a big data set contain of many small data sets and **PubMed** is one of them. +You can choose whatever dataset you want from here. +https://pile.eleuther.ai/ + +## Prepare the envinrontment + +Repository code is written in Python 3 using Narval Cluster provided by Digital Research Alliance of Canada(https://docs.alliancecan.ca/wiki/Getting_started). +While there are multiple ways to run a repository, here is one way to do it using Narval: + +From the command line, create a virtual environment: + +``` +virtualenv /project/def-yourName/yourDirctory +``` + +## Running Repository Code + +**DATA SELECTION** +In this research we process 500.000 PubMed documents, so you can navigate to the file and change the amount of document or the document you want to process +``` +python dataSelection +``` + +**Clean Data** +clean the data using the method we mention in the paper +``` +python cleanData.py +``` + +**Promt Selection** +Choose the seed for the LLMs +``` +python promtSelection.py +``` + +**Data Genaration** +Generate data from LLMS using the seed we created. + +Navigate to folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. +``` +.\generate_python_scripts.sh +.\generate_slurm_scripts.sh +.\submit_all_jobs.sh +``` + +After that navigate to each folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. and run +``` +python decode.py +``` +and we use the same cleaning data strageries to clean the data from LLMs + +**Heap's law data calculation** +heap's law need number of vocabulary and number of total word in documents so we need to navigate and produce the result use: +``` +python heaplaw.py +``` + +**Heap's law visualization** +generate the plot using +``` +python drawThePlotAndEstimation.py +``` + + + +**** + + + +## Citation +If you find anything useful please cite our work using: +``` +@misc{SarriaHurtado2023, + author = {Uyen Lai, Gurjit S. Randhawa, Paul Sheridan}, + title = {Heaps' Law in GPT-Neo Large Language Model Emulated Corpora}, + year = {2023}, + eprint = {arXiv:2311.06377v1} +} +``` From 6c2b9d3c7056186dfaedeeb297e0d9cde74778bc Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 07:45:14 -0400 Subject: [PATCH 06/25] fixed citation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e3aaa2..51f6e39 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ python drawThePlotAndEstimation.py ## Citation If you find anything useful please cite our work using: ``` -@misc{SarriaHurtado2023, +@misc{Lai2023, author = {Uyen Lai, Gurjit S. 
Randhawa, Paul Sheridan}, title = {Heaps' Law in GPT-Neo Large Language Model Emulated Corpora}, year = {2023}, From 4919772739a6b9e127fd14108ee61b7b80af80cc Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:01:07 -0400 Subject: [PATCH 07/25] copy edits --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 51f6e39..0c3847f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # Heaps' Law in GPT-Neo Large Language Model Emulated Corpora -Official repository for the workshop paper Heaps' Law in GPT-Neo Large Language Model Emulated Corpora -ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 +This repository contains computer code for reproducing the results described in the [EVIA 2023 Workshop](https://research.nii.ac.jp/ntcir/evia2023/) paper "Heaps' Law in GPT-Neo Large Language Model Emulated Corpora". ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 ## Getting Started From 1ccac3146391777a087811496265a645de0c6f15 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:06:39 -0400 Subject: [PATCH 08/25] copy edits --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0c3847f..8301ba3 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,11 @@ git clone https://github.com/paul-sheridan/paper-heaps-law-llm.git ``` and `cd` into the repository root folder `paper-heaps-law-llm`. -## Data +## Obtaining the Data + +Download the **Pubmed Abstracts** component data from The Pile (https://pile.eleuther.ai/), an 800GB dataset of diverse text for language modeling. -We download the data from The Pile, it is a big data set contain of many small data sets and **PubMed** is one of them. -You can choose whatever dataset you want from here. -https://pile.eleuther.ai/ ## Prepare the envinrontment From 7ed13586d3d55fe1ba97b3895b939c67023d6cd3 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:07:37 -0400 Subject: [PATCH 09/25] copy edits --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8301ba3..26e89e6 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,10 @@ and `cd` into the repository root folder `paper-heaps-law-llm`. ## Obtaining the Data -Download the **Pubmed Abstracts** component data from The Pile (https://pile.eleuther.ai/), an 800GB dataset of diverse text for language modeling. +Download the **Pubmed Abstracts** component dataset from The Pile ([download page](https://pile.eleuther.ai/)), an 800GB dataset of diverse text for language modeling. -## Prepare the envinrontment +## Prepare the Envinronment Repository code is written in Python 3 using Narval Cluster provided by Digital Research Alliance of Canada(https://docs.alliancecan.ca/wiki/Getting_started). While there are multiple ways to run a repository, here is one way to do it using Narval: From 45a2282ac5f01168e3abbce8b01a69e14b7f2b21 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:08:08 -0400 Subject: [PATCH 10/25] copy edits --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 26e89e6..17f41da 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ and `cd` into the repository root folder `paper-heaps-law-llm`. Download the **Pubmed Abstracts** component dataset from The Pile ([download page](https://pile.eleuther.ai/)), an 800GB dataset of diverse text for language modeling. 
-## Prepare the Envinronment +## Preparing the Environment Repository code is written in Python 3 using Narval Cluster provided by Digital Research Alliance of Canada(https://docs.alliancecan.ca/wiki/Getting_started). While there are multiple ways to run a repository, here is one way to do it using Narval: From f65342ea1e74e5ba8d87850da897bc93918fcd38 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:08:38 -0400 Subject: [PATCH 11/25] copy edits --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 17f41da..a5d39df 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ and `cd` into the repository root folder `paper-heaps-law-llm`. ## Obtaining the Data -Download the **Pubmed Abstracts** component dataset from The Pile ([download page](https://pile.eleuther.ai/)), an 800GB dataset of diverse text for language modeling. +Download the **Pubmed Abstracts** component corpus from The Pile ([download page](https://pile.eleuther.ai/)), an 800GB dataset of diverse text for language modeling. ## Preparing the Environment From 9b23ae0d7ba786c17247b16ecd3b9e7ccb8c159e Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:09:51 -0400 Subject: [PATCH 12/25] copy edits --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a5d39df..8a7b248 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Heaps' Law in GPT-Neo Large Language Model Emulated Corpora -This repository contains computer code for reproducing the results described in the [EVIA 2023 Workshop](https://research.nii.ac.jp/ntcir/evia2023/) paper "Heaps' Law in GPT-Neo Large Language Model Emulated Corpora". ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 +This repository contains computer code for reproducing the results described in the EVIA 2023 Workshop ([landing page](https://research.nii.ac.jp/ntcir/evia2023/)) paper "Heaps' Law in GPT-Neo Large Language Model Emulated Corpora". ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 ## Getting Started From 43a07d30b6fb7ec671f5454eecb7186ef90cef2e Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:18:17 -0400 Subject: [PATCH 13/25] copy edits --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8a7b248..dd4c70f 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,9 @@ Download the **Pubmed Abstracts** component corpus from The Pile ([download page ## Preparing the Environment -Repository code is written in Python 3 using Narval Cluster provided by Digital Research Alliance of Canada(https://docs.alliancecan.ca/wiki/Getting_started). -While there are multiple ways to run a repository, here is one way to do it using Narval: +Repository code is written in Python 3. It was run on the Narval cluster ([Narval wiki page](https://docs.alliancecan.ca/wiki/Narval/en)), provided by Digital Research Alliance of Canada ([Getting started wiki page](https://docs.alliancecan.ca/wiki/Getting_started)). 
+ +While there are multiple ways to run the repository code, here is one way to do it using Narval: From the command line, create a virtual environment: From ff1623a7357fce2b87c4b2b5527c2789cd7f6676 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:31:02 -0400 Subject: [PATCH 14/25] copy edits --- README.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index dd4c70f..2e9a9c6 100644 --- a/README.md +++ b/README.md @@ -29,25 +29,33 @@ virtualenv /project/def-yourName/yourDirctory ## Running Repository Code -**DATA SELECTION** -In this research we process 500.000 PubMed documents, so you can navigate to the file and change the amount of document or the document you want to process +### Data Selection + +In this research we analyze the first 500,000 abstracts in the PubMed Abstracts corpus. To prepare this dataset, run the `dataSelection.py` script: + ``` -python dataSelection +python dataSelection.py ``` -**Clean Data** -clean the data using the method we mention in the paper +To select a custom number of abstracts, navigate to the `dataSelection.py` script and set the `limit` variable on line 8 to be the number of documents that you want to process. + +### Data Preprocessing + +To preprocess the data according to the steps described in the paper, run: + ``` python cleanData.py ``` -**Promt Selection** +### Promt Selection + Choose the seed for the LLMs + ``` python promtSelection.py ``` -**Data Genaration** +### Data Genaration Generate data from LLMS using the seed we created. Navigate to folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. @@ -63,13 +71,14 @@ python decode.py ``` and we use the same cleaning data strageries to clean the data from LLMs -**Heap's law data calculation** +### Heaps' Law Estimation + heap's law need number of vocabulary and number of total word in documents so we need to navigate and produce the result use: ``` python heaplaw.py ``` -**Heap's law visualization** +### Heaps' Law Visualization generate the plot using ``` python drawThePlotAndEstimation.py From 7d8f154b940a1967431c56deb6c8eb39f3093c85 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:37:32 -0400 Subject: [PATCH 15/25] copy edits --- README.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2e9a9c6..e944ea1 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,6 @@ Repository code is written in Python 3. It was run on the Narval cluster ([Narva While there are multiple ways to run the repository code, here is one way to do it using Narval: From the command line, create a virtual environment: - ``` virtualenv /project/def-yourName/yourDirctory ``` @@ -32,7 +31,6 @@ virtualenv /project/def-yourName/yourDirctory ### Data Selection In this research we analyze the first 500,000 abstracts in the PubMed Abstracts corpus. To prepare this dataset, run the `dataSelection.py` script: - ``` python dataSelection.py ``` @@ -42,34 +40,32 @@ To select a custom number of abstracts, navigate to the `dataSelection.py` scrip ### Data Preprocessing To preprocess the data according to the steps described in the paper, run: - ``` python cleanData.py ``` -### Promt Selection - -Choose the seed for the LLMs +### Prompt Selection +To choose seed text for abstract emulation, run: ``` python promtSelection.py ``` ### Data Genaration -Generate data from LLMS using the seed we created. -Navigate to folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. 
+To emulate text from the GPTNeo LLMs using the above generated seed texts, run the following shell scripts from inside each of the folders `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b`: ``` .\generate_python_scripts.sh .\generate_slurm_scripts.sh .\submit_all_jobs.sh ``` -After that navigate to each folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. and run +After that navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run ``` python decode.py ``` -and we use the same cleaning data strageries to clean the data from LLMs +This script applies the same preprocesing strageries as used above. + ### Heaps' Law Estimation From 7f0842a6bac51a9234a9018182fa32ff5c92619c Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:42:36 -0400 Subject: [PATCH 16/25] copy edits --- README.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e944ea1..bc2a939 100644 --- a/README.md +++ b/README.md @@ -69,23 +69,20 @@ This script applies the same preprocesing strageries as used above. ### Heaps' Law Estimation -heap's law need number of vocabulary and number of total word in documents so we need to navigate and produce the result use: +To estimate the Heaps' law parameters for each GPTNeo model using simple linear regression, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run ``` -python heaplaw.py +python heapsLaw.py ``` +The parameter estimates are found in Table 1 of the paper. ### Heaps' Law Visualization -generate the plot using + +To generate the plots of Figure 1 in the paper, run ``` python drawThePlotAndEstimation.py ``` - -**** - - - ## Citation If you find anything useful please cite our work using: ``` From 21ebbf77367845f7f65fe4d992c6658f9f022aa1 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:45:59 -0400 Subject: [PATCH 17/25] copy edits --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bc2a939..5c171eb 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ python promtSelection.py ### Data Genaration -To emulate text from the GPTNeo LLMs using the above generated seed texts, run the following shell scripts from inside each of the folders `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b`: +To emulate text from the GPT-Neo models using the above generated seed texts, run the following shell scripts from inside each of the folders `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b`: ``` .\generate_python_scripts.sh .\generate_slurm_scripts.sh @@ -69,7 +69,7 @@ This script applies the same preprocesing strageries as used above. 
### Heaps' Law Estimation -To estimate the Heaps' law parameters for each GPTNeo model using simple linear regression, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run +To estimate the Heaps' law parameters for each GPT-Neo model using simple linear regression, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run ``` python heapsLaw.py ``` From 97c3d0e1b9f6ff3bb46746ee68bdf0415925a84e Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:11:18 -0400 Subject: [PATCH 18/25] update format update format for the file name --- DrawThePlotAndEstimation.py | 31 +++++++++++++++++-------------- heaplaw.py => heapsLaw.py | 0 2 files changed, 17 insertions(+), 14 deletions(-) rename heaplaw.py => heapsLaw.py (100%) diff --git a/DrawThePlotAndEstimation.py b/DrawThePlotAndEstimation.py index 045f543..d83618b 100644 --- a/DrawThePlotAndEstimation.py +++ b/DrawThePlotAndEstimation.py @@ -1,19 +1,18 @@ -import pickle - import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import statsmodels.api as sm - -def analyze_corpus(file_path, corpus_name): - # Read data - - - with open(file_path, 'rb') as f: - data = pickle.load(f) - data = data.drop(0) # Drop the first row +def analyze_corpus(file_path, corpus_name, file_type='csv'): + # Read data based on file type + if file_type == 'csv': + data = pd.read_csv(file_path, header=None, names=['n', 'm']) + data = data.drop(0) # Drop the first row + elif file_type == 'pkl': + data = pd.read_pickle(file_path) + else: + raise ValueError("Unsupported file type") # Fit model X = sm.add_constant(np.log10(data['n'])) @@ -34,13 +33,17 @@ def analyze_corpus(file_path, corpus_name): return data, model.params +pubmed_data, pubmed_params = analyze_corpus('data/pubmed.pkl', 'PubMed',file_type='csv') + +# Analyzing the GPT-Neo datasets (assuming they are in .pkl format) +gptneo125m_data, gptneo125m_params = analyze_corpus('data/heapLawData-gptneo125m.pkl', 'GPT-Neo-125m', file_type='pkl') +gptneo13b_data, gptneo13b_params = analyze_corpus('data/heapLawData-gptneo1.3B.pkl', 'GPT-Neo-1.3B', file_type='pkl') +gptneo27b_data, gptneo27b_params = analyze_corpus('data/heapLawData-gptneo2.7B.pkl', 'GPT-Neo-2.7B', file_type='pkl') + +# Rest of the code remains the same -pubmed_data, pubmed_params = analyze_corpus('data/pubmed.pkl', 'PubMed') -gptneo125m_data, gptneo125m_params = analyze_corpus('data/heapLawData-gptneo125m.pkl', 'GPT-Neo-125m') -gptneo13b_data, gptneo13b_params = analyze_corpus('data/heapLawData-gptneo1.3B.pkl', 'GPT-Neo-1.3B') -gptneo27b_data, gptneo27b_params = analyze_corpus('data/heapLawData-gptneo2.7B.pkl', 'GPT-Neo-2.7B') diff --git a/heaplaw.py b/heapsLaw.py similarity index 100% rename from heaplaw.py rename to heapsLaw.py From bd0f2146bd20bdc15d783f9a74f3503785d8f55c Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 15:15:04 -0400 Subject: [PATCH 19/25] file name update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5c171eb..4fcfbd3 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ python cleanData.py To choose seed text for abstract emulation, run: ``` -python promtSelection.py +python promptSelection.py ``` ### Data Genaration From 3db2244cd4f4bdcf4e4cca50d902afe6a0a73dfc Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 15:16:32 -0400 Subject: [PATCH 20/25] typo fix --- README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4fcfbd3..79b137b 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ While there are multiple ways to run the repository code, here is one way to do From the command line, create a virtual environment: ``` -virtualenv /project/def-yourName/yourDirctory +virtualenv /project/def-yourName/yourDirectory ``` ## Running Repository Code From cd24aaf7ca70c863837549193ac8bbeb53428044 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:18:07 -0400 Subject: [PATCH 21/25] format --- .idea/.gitignore | 3 + .idea/inspectionProfiles/Project_Default.xml | 193 ++++++++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 + .idea/modules.xml | 8 + .idea/paper-heaps-law-llm.iml | 8 + .idea/vcs.xml | 6 + 6 files changed, 224 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/paper-heaps-law-llm.iml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..48285f1 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,193 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..03b57fd --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/paper-heaps-law-llm.iml b/.idea/paper-heaps-law-llm.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/paper-heaps-law-llm.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 4aec00d31a075d1542699b472a4e11bf3317c102 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:23:31 -0400 Subject: [PATCH 22/25] change thr prompt file name --- promtSelection.py => promptSelection.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename promtSelection.py => promptSelection.py (100%) diff --git a/promtSelection.py b/promptSelection.py similarity index 100% rename from promtSelection.py rename to promptSelection.py From 713e981f3cedcfc0abf199472721d0ce47305184 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:28:02 -0400 Subject: [PATCH 23/25] commit --- DrawThePlotAndEstimation.py | 83 ---------------------------- DataSelection.py => dataSelection.py | 0 drawThePlotAndEstimattion.py | 31 +++++++++++ 3 files changed, 31 insertions(+), 83 deletions(-) delete mode 100644 DrawThePlotAndEstimation.py rename DataSelection.py => dataSelection.py (100%) create mode 100644 drawThePlotAndEstimattion.py 
diff --git a/DrawThePlotAndEstimation.py b/DrawThePlotAndEstimation.py deleted file mode 100644 index d83618b..0000000 --- a/DrawThePlotAndEstimation.py +++ /dev/null @@ -1,83 +0,0 @@ -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -import statsmodels.api as sm - -def analyze_corpus(file_path, corpus_name, file_type='csv'): - # Read data based on file type - if file_type == 'csv': - data = pd.read_csv(file_path, header=None, names=['n', 'm']) - data = data.drop(0) # Drop the first row - elif file_type == 'pkl': - data = pd.read_pickle(file_path) - else: - raise ValueError("Unsupported file type") - - # Fit model - X = sm.add_constant(np.log10(data['n'])) - model = sm.OLS(np.log10(data['m']), X).fit() - - # Summary statistics - alpha_hat = 10 ** model.params[0] - beta_hat = model.params[1] - rsq = model.rsquared - print(f"{corpus_name} - alpha (est): {alpha_hat:.4f}, beta (est): {beta_hat:.4f}, R squared: {rsq:.4f}") - - # Confidence intervals - conf_int = model.conf_int(alpha=0.1) # 90% CI - alpha_ci_low, alpha_ci_high = 10 ** conf_int.iloc[0, 0], 10 ** conf_int.iloc[0, 1] - beta_ci_low, beta_ci_high = conf_int.iloc[1, 0], conf_int.iloc[1, 1] - print(f"{corpus_name} - 90% CI for alpha: [{alpha_ci_low:.4f}, {alpha_ci_high:.4f}]") - print(f"{corpus_name} - 90% CI for beta: [{beta_ci_low:.4f}, {beta_ci_high:.4f}]") - - return data, model.params - -pubmed_data, pubmed_params = analyze_corpus('data/pubmed.pkl', 'PubMed',file_type='csv') - -# Analyzing the GPT-Neo datasets (assuming they are in .pkl format) -gptneo125m_data, gptneo125m_params = analyze_corpus('data/heapLawData-gptneo125m.pkl', 'GPT-Neo-125m', file_type='pkl') -gptneo13b_data, gptneo13b_params = analyze_corpus('data/heapLawData-gptneo1.3B.pkl', 'GPT-Neo-1.3B', file_type='pkl') -gptneo27b_data, gptneo27b_params = analyze_corpus('data/heapLawData-gptneo2.7B.pkl', 'GPT-Neo-2.7B', file_type='pkl') - -# Rest of the code remains the same - - - - - - -def plot_corpus(data, params, corpus_name, ax, log_scale=False): - if log_scale: - sns.lineplot(x=np.log10(data['n']), y=np.log10(data['m']), ax=ax, label=f"{corpus_name}: β={params[1]:.4f}") - else: - sns.lineplot(x=data['n'], y=data['m'], ax=ax, label=f"{corpus_name}: β={params[1]:.4f}") - -# Natural Scale Plot -plt.figure(figsize=(12, 8)) -ax1 = plt.subplot(1, 2, 1) -plot_corpus(pubmed_data, pubmed_params, 'PubMed', ax1) -plot_corpus(gptneo125m_data, gptneo125m_params, 'GPT-Neo-125m', ax1) -plot_corpus(gptneo13b_data, gptneo13b_params, 'GPT-Neo-1.3B', ax1) -plot_corpus(gptneo27b_data, gptneo27b_params, 'GPT-Neo-2.7B', ax1) -plt.title("Heaps' Law - Natural Scale") -plt.xlabel('Total Words') -plt.ylabel('Vocabulary Size') -plt.legend() - -# Log-Log Scale Plot -ax2 = plt.subplot(1, 2, 2) -plot_corpus(pubmed_data, pubmed_params, 'PubMed', ax2, log_scale=True) -plot_corpus(gptneo125m_data, gptneo125m_params, 'GPT-Neo-125m', ax2, log_scale=True) -plot_corpus(gptneo13b_data, gptneo13b_params, 'GPT-Neo-1.3B', ax2, log_scale=True) -plot_corpus(gptneo27b_data, gptneo27b_params, 'GPT-Neo-2.7B', ax2, log_scale=True) -plt.title("Heaps' Law - Log-Log Scale") -plt.xlabel('Log Total Words') -plt.ylabel('Log Vocabulary Size') -plt.legend() - -plt.tight_layout() -# plt.show() - - - diff --git a/DataSelection.py b/dataSelection.py similarity index 100% rename from DataSelection.py rename to dataSelection.py diff --git a/drawThePlotAndEstimattion.py b/drawThePlotAndEstimattion.py new file mode 100644 index 0000000..8152087 --- /dev/null +++ 
b/drawThePlotAndEstimattion.py @@ -0,0 +1,31 @@ +import json +import pickle +from processData import process_data + +def main(): + file_path = 'data/test.jsonl' + processed_data = [] + limit = 500000 + prompt_length = 5 + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + data_chunk = json.loads(line).get('text', '') # Assuming each line is a JSON object with an 'abstract' field + processed_line = process_data(data_chunk) + if len(processed_line) > prompt_length: # Check if the processed line has more than 10 words + processed_data.append(processed_line) + if len(processed_data) >= limit: + break + with open('processData.pickle', 'wb') as f: + pickle.dump(processed_data, f) + print("Data has been processed and saved to 'processData.pickle'") + except FileNotFoundError: + print("The file was not found. Please check the file path.") + except json.JSONDecodeError: + print("Error decoding JSON. Please check the file content.") + except Exception as e: + print("An error occurred:", str(e)) + +if __name__ == "__main__": + main() + From 660dfbc46801f1b18967dad3e229db172886d04d Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:28:51 -0400 Subject: [PATCH 24/25] delete --- .idea/.gitignore | 3 - .idea/inspectionProfiles/Project_Default.xml | 193 ------------------ .../inspectionProfiles/profiles_settings.xml | 6 - .idea/modules.xml | 8 - .idea/paper-heaps-law-llm.iml | 8 - .idea/vcs.xml | 6 - 6 files changed, 224 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/inspectionProfiles/Project_Default.xml delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/paper-heaps-law-llm.iml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 48285f1..0000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,193 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 03b57fd..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/paper-heaps-law-llm.iml b/.idea/paper-heaps-law-llm.iml deleted file mode 100644 index d0876a7..0000000 --- a/.idea/paper-heaps-law-llm.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 56974355307aaa517ba482b6e662c6eb2fdc6c2e Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 15:34:15 -0400 Subject: [PATCH 25/25] copy edits --- README.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 79b137b..a7a21ab 100644 --- a/README.md +++ b/README.md @@ -67,17 +67,14 @@ python decode.py This 
script applies the same preprocesing strageries as used above. -### Heaps' Law Estimation +### Heaps' Law Estimation and Visualization -To estimate the Heaps' law parameters for each GPT-Neo model using simple linear regression, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run +To prepare the emulated texts for analysis, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run ``` python heapsLaw.py ``` -The parameter estimates are found in Table 1 of the paper. -### Heaps' Law Visualization - -To generate the plots of Figure 1 in the paper, run +To generate the plots of Figure 1 and Heaps' law parameter estimates of Table 1, run ``` python drawThePlotAndEstimation.py ```
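
The body of `promptSelection.py` (renamed from `promtSelection.py` in patch 22) is not included in these patches. A minimal sketch of the seed-selection step, assuming the cleaned abstracts are the ones pickled during data selection and that each seed is simply the first `prompt_length = 5` words of an abstract (both assumptions, not behaviour confirmed by the repository code):

```
# Hypothetical sketch only: promptSelection.py itself is not shown in these
# patches. Assumptions: the cleaned abstracts are stored in
# 'processData.pickle', and a seed prompt is the first 5 words of an abstract.
import pickle

PROMPT_LENGTH = 5  # mirrors the prompt_length variable used during data selection


def first_words(abstract, k=PROMPT_LENGTH):
    # The return type of process_data() is not shown here, so handle either a
    # whitespace-delimited string or a list of tokens.
    tokens = abstract.split() if isinstance(abstract, str) else list(abstract)
    return " ".join(tokens[:k])


def main():
    with open("processData.pickle", "rb") as f:
        abstracts = pickle.load(f)
    prompts = [first_words(a) for a in abstracts]
    with open("prompts.pickle", "wb") as f:  # placeholder output name
        pickle.dump(prompts, f)
    print(f"Saved {len(prompts)} seed prompts to 'prompts.pickle'")


if __name__ == "__main__":
    main()
```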
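
The per-model generation scripts written out by `generate_python_scripts.sh` are likewise not reproduced in these patches. As an illustration only, a single generation job built on the Hugging Face `transformers` text-generation pipeline might look as follows; the prompt file name, slice bounds, and sampling parameters are placeholders rather than the repository's actual settings:

```
# Illustrative sketch of one GPT-Neo generation job; the scripts that
# generate_python_scripts.sh actually emits may differ. The prompt file name,
# slice bounds, and sampling parameters below are placeholder assumptions.
import pickle

from transformers import pipeline

MODEL_NAME = "EleutherAI/gpt-neo-125M"  # or "EleutherAI/gpt-neo-1.3B" / "EleutherAI/gpt-neo-2.7B"


def main(start=0, end=100):
    generator = pipeline("text-generation", model=MODEL_NAME)
    with open("prompts.pickle", "rb") as f:
        prompts = pickle.load(f)

    generated = []
    for prompt in prompts[start:end]:
        out = generator(prompt, do_sample=True, max_length=256, num_return_sequences=1)
        generated.append(out[0]["generated_text"])

    out_name = f"generated_{MODEL_NAME.split('/')[-1]}_{start}_{end}.pickle"
    with open(out_name, "wb") as f:
        pickle.dump(generated, f)


if __name__ == "__main__":
    main()
```

Swapping `MODEL_NAME` for the 1.3B or 2.7B checkpoint would cover the other two model sizes; on a cluster, each Slurm job would presumably run one such script over its own slice of the prompts.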
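
Similarly, `heapsLaw.py` (renamed from `heaplaw.py` in patch 18) is not shown, but the `analyze_corpus()` function above makes its required output clear: a table with a running total word count `n` and running vocabulary size `m`. Heaps' law posits m ≈ alpha * n^beta, so on log-log axes log10(m) = log10(alpha) + beta * log10(n), which is why `analyze_corpus()` fits an OLS line to (log10 n, log10 m) and exponentiates the intercept to recover alpha. A minimal sketch of that bookkeeping, with assumed input and output file names:

```
# Minimal sketch of the Heaps' law bookkeeping: record the running total word
# count n and the running vocabulary size m over the corpus, the two columns
# that analyze_corpus() expects. Input and output file names are assumptions.
import pickle

import pandas as pd


def heaps_curve(documents):
    vocabulary = set()
    total_words = 0
    rows = []
    for doc in documents:
        tokens = doc.split() if isinstance(doc, str) else list(doc)
        for token in tokens:
            total_words += 1
            vocabulary.add(token)
        rows.append({"n": total_words, "m": len(vocabulary)})
    return pd.DataFrame(rows)


if __name__ == "__main__":
    with open("cleanData.pickle", "rb") as f:  # placeholder input name
        documents = pickle.load(f)
    # Written as a .pkl so it can be read back with pd.read_pickle(), matching
    # the file_type='pkl' branch of analyze_corpus().
    heaps_curve(documents).to_pickle("heapsLawData.pkl")
```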