From c0ba4f882f984f5c69e049af291cb44f234739d8 Mon Sep 17 00:00:00 2001 From: paul-sheridan Date: Tue, 24 Oct 2023 10:55:37 -0300 Subject: [PATCH 01/25] added title --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a9fb9e..4a99f5f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,3 @@ -# paper-heaps-law-llm +# Heaps' Law in GPT-Neo Large Language Model Emulated Corpora + Official repository for the workshop paper Heaps' Law in GPT-Neo Large Language Model Emulated Corpora From 5b16101043bd24b713b15b759d2b68bc0e9c11f4 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 3 Nov 2023 16:21:20 -0300 Subject: [PATCH 02/25] first commit commit the first version of the code --- gitHubOfficalCode/.idea/.gitignore | 3 + gitHubOfficalCode/.idea/gitHubOfficalCode.iml | 8 + .../inspectionProfiles/Project_Default.xml | 157 ++++++++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 + gitHubOfficalCode/.idea/misc.xml | 4 + gitHubOfficalCode/.idea/modules.xml | 8 + gitHubOfficalCode/DataSelection.py | 31 ++++ gitHubOfficalCode/DrawThePlotAndEstimation.py | 57 +++++++ gitHubOfficalCode/cleanData.py | 23 +++ gitHubOfficalCode/data/test.jsonl | 3 + .../dataGenaration/gpt-neo-1.3b/decode.py | 39 +++++ .../gpt-neo-1.3b/generate_python_scripts.sh | 105 ++++++++++++ .../gpt-neo-1.3b/generate_slurm_scripts.sh | 22 +++ .../gpt-neo-1.3b/submit_all_jobs.sh | 12 ++ .../dataGenaration/gpt-neo-125m/decode.py | 39 +++++ .../gpt-neo-125m/generate_python_scripts.sh | 105 ++++++++++++ .../gpt-neo-125m/generate_slurm_scripts.sh | 22 +++ .../gpt-neo-125m/submit_all_jobs.sh | 12 ++ .../dataGenaration/gpt-neo-2.7b/decode.py | 39 +++++ .../gpt-neo-2.7b/generate_python_scripts.sh | 105 ++++++++++++ .../gpt-neo-2.7b/generate_slurm_scripts.sh | 22 +++ .../gpt-neo-2.7b/submit_all_jobs.sh | 12 ++ gitHubOfficalCode/heaplaw.py | 39 +++++ gitHubOfficalCode/processData.py | 14 ++ gitHubOfficalCode/promtSelection.py | 48 ++++++ 25 files changed, 935 insertions(+) create mode 100644 gitHubOfficalCode/.idea/.gitignore create mode 100644 gitHubOfficalCode/.idea/gitHubOfficalCode.iml create mode 100644 gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml create mode 100644 gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml create mode 100644 gitHubOfficalCode/.idea/misc.xml create mode 100644 gitHubOfficalCode/.idea/modules.xml create mode 100644 gitHubOfficalCode/DataSelection.py create mode 100644 gitHubOfficalCode/DrawThePlotAndEstimation.py create mode 100644 gitHubOfficalCode/cleanData.py create mode 100644 gitHubOfficalCode/data/test.jsonl create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh create mode 100644 
gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh create mode 100644 gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh create mode 100644 gitHubOfficalCode/heaplaw.py create mode 100644 gitHubOfficalCode/processData.py create mode 100644 gitHubOfficalCode/promtSelection.py diff --git a/gitHubOfficalCode/.idea/.gitignore b/gitHubOfficalCode/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/gitHubOfficalCode/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/gitHubOfficalCode/.idea/gitHubOfficalCode.iml b/gitHubOfficalCode/.idea/gitHubOfficalCode.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/gitHubOfficalCode/.idea/gitHubOfficalCode.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml b/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..4ea2cf8 --- /dev/null +++ b/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,157 @@ + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml b/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/misc.xml b/gitHubOfficalCode/.idea/misc.xml new file mode 100644 index 0000000..a971a2c --- /dev/null +++ b/gitHubOfficalCode/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/modules.xml b/gitHubOfficalCode/.idea/modules.xml new file mode 100644 index 0000000..c7b47ab --- /dev/null +++ b/gitHubOfficalCode/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/gitHubOfficalCode/DataSelection.py b/gitHubOfficalCode/DataSelection.py new file mode 100644 index 0000000..8152087 --- /dev/null +++ b/gitHubOfficalCode/DataSelection.py @@ -0,0 +1,31 @@ +import json +import pickle +from processData import process_data + +def main(): + file_path = 'data/test.jsonl' + processed_data = [] + limit = 500000 + prompt_length = 5 + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + data_chunk = json.loads(line).get('text', '') # Assuming each line is a JSON object with an 'abstract' field + processed_line = process_data(data_chunk) + if len(processed_line) > prompt_length: # Check if the processed line has more than 10 words + processed_data.append(processed_line) + if len(processed_data) >= limit: + break + with open('processData.pickle', 'wb') as f: + pickle.dump(processed_data, f) + print("Data has been processed and saved to 'processData.pickle'") + except FileNotFoundError: + print("The file was not found. Please check the file path.") + except json.JSONDecodeError: + print("Error decoding JSON. 
Please check the file content.") + except Exception as e: + print("An error occurred:", str(e)) + +if __name__ == "__main__": + main() + diff --git a/gitHubOfficalCode/DrawThePlotAndEstimation.py b/gitHubOfficalCode/DrawThePlotAndEstimation.py new file mode 100644 index 0000000..6e89451 --- /dev/null +++ b/gitHubOfficalCode/DrawThePlotAndEstimation.py @@ -0,0 +1,57 @@ +import math +import pickle +import numpy as np +from matplotlib import pyplot as plt + +class pubMed: + def __init__(self): + self.data = {} + + def loadDocument(self, filename, label): + with open(filename, 'rb') as f: + xy = pickle.load(f) + x, y = [], [] + for xAndy in xy: + x.append(xAndy[0]) + y.append(xAndy[1]) + # x.append(math.log10(xAndy[0])) + # y.append(math.log10(xAndy[1])) + self.data[label] = (x, y) + + def DrawHeapLaw(self, step): + colors = ['r', 'g', 'b'] + plt.ylabel('Vocabulary Size') + plt.xlabel('Collection Size') + plt.title("Heaps' law") + + for idx, (label, (x, y)) in enumerate(self.data.items()): + step -= 1 + i = 0 + print(x) + filtered_x, filtered_y = [], [] + while i < len(x): + + filtered_x.append(x[i]) + filtered_y.append(y[i]) + i += step + plt.plot(filtered_x, filtered_y, color=colors[idx], label=label) + beta , logk = np.polyfit(filtered_x, filtered_y, 1) + print(f"Slope for {label}: {beta}") + print(f"Slope for {label}: {10**logk}") + + + + + plt.legend() + plt.grid(True) + plt.savefig('-loglog.pdf', transparent=True) + plt.show() + + +onehu = pubMed() +files = [ 'heapLawData-selectedPromt.pkl' , "heaplaw125m.pkl","heaplaw1.3b.pkl"] +labels = ["PubMed","125m","1.3b" ] +for file, label in zip(files, labels): + onehu.loadDocument(file, label) + +onehu.DrawHeapLaw(10) diff --git a/gitHubOfficalCode/cleanData.py b/gitHubOfficalCode/cleanData.py new file mode 100644 index 0000000..9dd7e60 --- /dev/null +++ b/gitHubOfficalCode/cleanData.py @@ -0,0 +1,23 @@ +import pickle +import random +# Make sure processData.py is in the same directory or in the PYTHONPATH +from processData import process_data + +# Path to the original and new pickle files +input_path = 'dataGenaration/result/result-gptNeo2.7b.pkl' +output_path = 'data/proccessData2.7.pkl' + +# Load the original pickle file +with open(input_path, 'rb') as file: + original_data = pickle.load(file) +print(original_data) +random.shuffle(original_data) +# Process the original data using the imported function +processed_data = [process_data(item) for item in original_data] + +# Now, we will save the processed data as an array of arrays +with open(output_path, 'wb') as outfile: + pickle.dump(processed_data, outfile) + +print("Data processing complete and saved to proccessData125m.pkl") + diff --git a/gitHubOfficalCode/data/test.jsonl b/gitHubOfficalCode/data/test.jsonl new file mode 100644 index 0000000..94b381f --- /dev/null +++ b/gitHubOfficalCode/data/test.jsonl @@ -0,0 +1,3 @@ +{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} +{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} +{"meta": {"pmid": 11409574, "language": "eng"}, "text": " in children with ARI and relative risks for the association "} diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py new file mode 100644 index 0000000..b557b0c --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py @@ -0,0 +1,39 @@ +import pickle +from transformers import 
GPT2Tokenizer + +def decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file): + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + all_decoded_texts = [] + + for i in range(start_idx, end_idx + 1): + input_file = f"{input_folder}/gpt-neo-125m-{i}.pkl" + print(f"Processing {input_file}...") + + # Load the generated text (as token IDs) + try: + with open(input_file, 'rb') as f: + generated_text_ids = pickle.load(f) + + # Decode the text + decoded_texts = [tokenizer.decode(text_id, skip_special_tokens=True) for text_id in generated_text_ids] + all_decoded_texts.extend(decoded_texts) + except FileNotFoundError: + print(f"File not found: {input_file}") + except Exception as e: + print(f"An error occurred while processing {input_file}: {str(e)}") + + # Save all decoded text to a file + with open(output_file, 'wb') as f: + pickle.dump(all_decoded_texts, f) + + print(f"Decoded texts saved to {output_file}") + +if __name__ == '__main__': + model_name = 'EleutherAI/gpt-neo-125M' + start_idx = 2 + end_idx = 20 + input_folder = 'dataGenaration/gpt-neo-125m' + output_file = 'dataGenaration/result/result-gptNeo125m.pkl' + decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file) + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh new file mode 100644 index 0000000..af0d50e --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh @@ -0,0 +1,105 @@ + +# Number of scripts you want to create +num_scripts=20 +step=25000 + +for i in $(seq 1 $num_scripts); do + start_point=$(( ($i-1) * $step )) + end_point=$(( $i * $step )) + file_name="gpt-neo-125m-$i.py" + output_pickle_file="gpt-neo-125m-$i.pkl" + + # Generate the Python script + cat > $file_name < max_position_embeddings: + raise ValueError(f"Input IDs length ({batch_input_ids.shape[1]}) exceeds model's max position embeddings ({max_position_embeddings}).") + + max_desired_length = max(batch_lengths).item() + total_length = batch_input_ids.shape[1] + max_desired_length + + if total_length > max_position_embeddings: + print(f"Warning: Desired total length ({total_length}) exceeds model's max position embeddings ({max_position_embeddings}). 
Truncating to {max_position_embeddings}.") + total_length = max_position_embeddings + + generated = self.model.generate(batch_input_ids, + max_length=total_length, + pad_token_id=self.tokenizer.pad_token_id) + + outputs.extend(generated.cpu().numpy()) + + self.rawDoc = outputs + + def decode(self, data): + return self.tokenizer.decode(data) + + +if __name__ == '__main__': + # Set up start and end points, model, tokenizer, and device + start_point = $start_point + end_point = $end_point + output_pickle_file = '$output_pickle_file' + model_name = 'EleutherAI/gpt-neo-125m' + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Initialize model and tokenizer + model = GPTNeoForCausalLM.from_pretrained(model_name).to(device) + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + # Set padding token + tokenizer.pad_token = tokenizer.eos_token + + # Initialize LLMsGeneration instance + llms_generation = LLMsGeneration(model, tokenizer, device, start_point, end_point) + + # Load array and generate text + llms_generation.loadArray('data/promptSelection.pickle') + + # Save the generated text + with open(output_pickle_file, 'wb') as file: + pickle.dump(llms_generation.rawDoc, file) + +EOF + +done diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh new file mode 100644 index 0000000..fa214e9 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +for i in {1..20} +do + output_file="slurm-job-${i}.sh" + echo "Creating $output_file..." + + cat < $output_file +#!/bin/bash +#SBATCH --account=your_account_name +#SBATCH --gpus-per-node=a100:1 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=3 +#SBATCH --mem=70G +#SBATCH --time=0-15:00 +module load python/3.11.2 +module load StdEnv/2020 +python gpt-neo-125m-${i}.py + +EOT +done + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh new file mode 100644 index 0000000..3f08cd4 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# Loop through the generated slurm job scripts and submit them +for i in {2..20} +do + slurm_file="slurm-job-${i}.sh" + echo "Submitting $slurm_file..." + sbatch $slurm_file +done + +echo "All jobs submitted." 
+ diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py new file mode 100644 index 0000000..b557b0c --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py @@ -0,0 +1,39 @@ +import pickle +from transformers import GPT2Tokenizer + +def decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file): + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + all_decoded_texts = [] + + for i in range(start_idx, end_idx + 1): + input_file = f"{input_folder}/gpt-neo-125m-{i}.pkl" + print(f"Processing {input_file}...") + + # Load the generated text (as token IDs) + try: + with open(input_file, 'rb') as f: + generated_text_ids = pickle.load(f) + + # Decode the text + decoded_texts = [tokenizer.decode(text_id, skip_special_tokens=True) for text_id in generated_text_ids] + all_decoded_texts.extend(decoded_texts) + except FileNotFoundError: + print(f"File not found: {input_file}") + except Exception as e: + print(f"An error occurred while processing {input_file}: {str(e)}") + + # Save all decoded text to a file + with open(output_file, 'wb') as f: + pickle.dump(all_decoded_texts, f) + + print(f"Decoded texts saved to {output_file}") + +if __name__ == '__main__': + model_name = 'EleutherAI/gpt-neo-125M' + start_idx = 2 + end_idx = 20 + input_folder = 'dataGenaration/gpt-neo-125m' + output_file = 'dataGenaration/result/result-gptNeo125m.pkl' + decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file) + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh new file mode 100644 index 0000000..af0d50e --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh @@ -0,0 +1,105 @@ + +# Number of scripts you want to create +num_scripts=20 +step=25000 + +for i in $(seq 1 $num_scripts); do + start_point=$(( ($i-1) * $step )) + end_point=$(( $i * $step )) + file_name="gpt-neo-125m-$i.py" + output_pickle_file="gpt-neo-125m-$i.pkl" + + # Generate the Python script + cat > $file_name < max_position_embeddings: + raise ValueError(f"Input IDs length ({batch_input_ids.shape[1]}) exceeds model's max position embeddings ({max_position_embeddings}).") + + max_desired_length = max(batch_lengths).item() + total_length = batch_input_ids.shape[1] + max_desired_length + + if total_length > max_position_embeddings: + print(f"Warning: Desired total length ({total_length}) exceeds model's max position embeddings ({max_position_embeddings}). 
Truncating to {max_position_embeddings}.") + total_length = max_position_embeddings + + generated = self.model.generate(batch_input_ids, + max_length=total_length, + pad_token_id=self.tokenizer.pad_token_id) + + outputs.extend(generated.cpu().numpy()) + + self.rawDoc = outputs + + def decode(self, data): + return self.tokenizer.decode(data) + + +if __name__ == '__main__': + # Set up start and end points, model, tokenizer, and device + start_point = $start_point + end_point = $end_point + output_pickle_file = '$output_pickle_file' + model_name = 'EleutherAI/gpt-neo-125m' + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Initialize model and tokenizer + model = GPTNeoForCausalLM.from_pretrained(model_name).to(device) + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + # Set padding token + tokenizer.pad_token = tokenizer.eos_token + + # Initialize LLMsGeneration instance + llms_generation = LLMsGeneration(model, tokenizer, device, start_point, end_point) + + # Load array and generate text + llms_generation.loadArray('data/promptSelection.pickle') + + # Save the generated text + with open(output_pickle_file, 'wb') as file: + pickle.dump(llms_generation.rawDoc, file) + +EOF + +done diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh new file mode 100644 index 0000000..fa214e9 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +for i in {1..20} +do + output_file="slurm-job-${i}.sh" + echo "Creating $output_file..." + + cat < $output_file +#!/bin/bash +#SBATCH --account=your_account_name +#SBATCH --gpus-per-node=a100:1 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=3 +#SBATCH --mem=70G +#SBATCH --time=0-15:00 +module load python/3.11.2 +module load StdEnv/2020 +python gpt-neo-125m-${i}.py + +EOT +done + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh new file mode 100644 index 0000000..3f08cd4 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# Loop through the generated slurm job scripts and submit them +for i in {2..20} +do + slurm_file="slurm-job-${i}.sh" + echo "Submitting $slurm_file..." + sbatch $slurm_file +done + +echo "All jobs submitted." 
+ diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py new file mode 100644 index 0000000..b557b0c --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py @@ -0,0 +1,39 @@ +import pickle +from transformers import GPT2Tokenizer + +def decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file): + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + all_decoded_texts = [] + + for i in range(start_idx, end_idx + 1): + input_file = f"{input_folder}/gpt-neo-125m-{i}.pkl" + print(f"Processing {input_file}...") + + # Load the generated text (as token IDs) + try: + with open(input_file, 'rb') as f: + generated_text_ids = pickle.load(f) + + # Decode the text + decoded_texts = [tokenizer.decode(text_id, skip_special_tokens=True) for text_id in generated_text_ids] + all_decoded_texts.extend(decoded_texts) + except FileNotFoundError: + print(f"File not found: {input_file}") + except Exception as e: + print(f"An error occurred while processing {input_file}: {str(e)}") + + # Save all decoded text to a file + with open(output_file, 'wb') as f: + pickle.dump(all_decoded_texts, f) + + print(f"Decoded texts saved to {output_file}") + +if __name__ == '__main__': + model_name = 'EleutherAI/gpt-neo-125M' + start_idx = 2 + end_idx = 20 + input_folder = 'dataGenaration/gpt-neo-125m' + output_file = 'dataGenaration/result/result-gptNeo125m.pkl' + decode_generated_text(model_name, start_idx, end_idx, input_folder, output_file) + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh new file mode 100644 index 0000000..af0d50e --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh @@ -0,0 +1,105 @@ + +# Number of scripts you want to create +num_scripts=20 +step=25000 + +for i in $(seq 1 $num_scripts); do + start_point=$(( ($i-1) * $step )) + end_point=$(( $i * $step )) + file_name="gpt-neo-125m-$i.py" + output_pickle_file="gpt-neo-125m-$i.pkl" + + # Generate the Python script + cat > $file_name < max_position_embeddings: + raise ValueError(f"Input IDs length ({batch_input_ids.shape[1]}) exceeds model's max position embeddings ({max_position_embeddings}).") + + max_desired_length = max(batch_lengths).item() + total_length = batch_input_ids.shape[1] + max_desired_length + + if total_length > max_position_embeddings: + print(f"Warning: Desired total length ({total_length}) exceeds model's max position embeddings ({max_position_embeddings}). 
Truncating to {max_position_embeddings}.") + total_length = max_position_embeddings + + generated = self.model.generate(batch_input_ids, + max_length=total_length, + pad_token_id=self.tokenizer.pad_token_id) + + outputs.extend(generated.cpu().numpy()) + + self.rawDoc = outputs + + def decode(self, data): + return self.tokenizer.decode(data) + + +if __name__ == '__main__': + # Set up start and end points, model, tokenizer, and device + start_point = $start_point + end_point = $end_point + output_pickle_file = '$output_pickle_file' + model_name = 'EleutherAI/gpt-neo-125m' + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Initialize model and tokenizer + model = GPTNeoForCausalLM.from_pretrained(model_name).to(device) + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + # Set padding token + tokenizer.pad_token = tokenizer.eos_token + + # Initialize LLMsGeneration instance + llms_generation = LLMsGeneration(model, tokenizer, device, start_point, end_point) + + # Load array and generate text + llms_generation.loadArray('data/promptSelection.pickle') + + # Save the generated text + with open(output_pickle_file, 'wb') as file: + pickle.dump(llms_generation.rawDoc, file) + +EOF + +done diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh new file mode 100644 index 0000000..fa214e9 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +for i in {1..20} +do + output_file="slurm-job-${i}.sh" + echo "Creating $output_file..." + + cat < $output_file +#!/bin/bash +#SBATCH --account=your_account_name +#SBATCH --gpus-per-node=a100:1 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=3 +#SBATCH --mem=70G +#SBATCH --time=0-15:00 +module load python/3.11.2 +module load StdEnv/2020 +python gpt-neo-125m-${i}.py + +EOT +done + diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh new file mode 100644 index 0000000..3f08cd4 --- /dev/null +++ b/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# Loop through the generated slurm job scripts and submit them +for i in {2..20} +do + slurm_file="slurm-job-${i}.sh" + echo "Submitting $slurm_file..." + sbatch $slurm_file +done + +echo "All jobs submitted." 
+ diff --git a/gitHubOfficalCode/heaplaw.py b/gitHubOfficalCode/heaplaw.py new file mode 100644 index 0000000..ed43c50 --- /dev/null +++ b/gitHubOfficalCode/heaplaw.py @@ -0,0 +1,39 @@ +import pickle +import random + +def process_data(input_file): + # Initialize counters + overall_total_words = 0 + overall_unique_words = set() + + with open(input_file, 'rb') as f: + data = pickle.load(f) + + # Shuffle the main data array + random.shuffle(data) + + # Initialize the result array + result = [] + + for word_array in data: + # Update the overall counters + overall_total_words += len(word_array) + overall_unique_words.update(word_array) + + # Append the current count and unique count to result + result.append([overall_total_words, len(overall_unique_words)]) + + return result + +if __name__ == '__main__': + input_file = 'data/proccessData1.3.pkl' + output_file = 'dataGenaration/result/heaplaw1.3b.pkl' + + result = process_data(input_file) + + # Save the result array to a file + with open(output_file, 'wb') as f: + pickle.dump(result, f) + + print(f"Result saved to {output_file}") + diff --git a/gitHubOfficalCode/processData.py b/gitHubOfficalCode/processData.py new file mode 100644 index 0000000..361cfd3 --- /dev/null +++ b/gitHubOfficalCode/processData.py @@ -0,0 +1,14 @@ +import re +import unicodedata + +def process_data(data): + # Normalize data + normalized_data = unicodedata.normalize('NFKD', data).encode('ascii', 'ignore').decode('utf-8', 'ignore') + # Convert to lowercase + normalized_data = normalized_data.lower() + # Remove punctuation + normalized_data = re.sub(r'[^\w\s]', '', normalized_data).strip() + # Tokenize + tokens = normalized_data.split() + return tokens + diff --git a/gitHubOfficalCode/promtSelection.py b/gitHubOfficalCode/promtSelection.py new file mode 100644 index 0000000..3623d54 --- /dev/null +++ b/gitHubOfficalCode/promtSelection.py @@ -0,0 +1,48 @@ +from transformers import GPT2Tokenizer +import pickle + +def create_prompts(data, prompt_length=5, tokenizer=None): + prompts = [] + for item in data: + if tokenizer is None: + raise ValueError("Tokenizer must be provided") + + tokenized_item = tokenizer(item, add_special_tokens=False) + tokenized_length = len(tokenized_item['input_ids']) + + if tokenized_length >= prompt_length: + prompt = item[:prompt_length] + else: + prompt = item + + prompts.append((prompt, tokenized_length)) + return prompts + +def main(): + data_folder = '/data' + processData_file = f'{data_folder}/processData.pickle' + promptSelection_file = f'{data_folder}/promptSelection.pickle' + model_name = 'EleutherAI/gpt-neo-125m' + + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + + # Step 1: Read Data + with open(processData_file, 'rb') as f: + processed_data = pickle.load(f) + + # Select only the first 2 items for quick testing + + # Step 2: Create Prompts and Store Tokenized Length + prompts = create_prompts(processed_data, 10, tokenizer) + + # Step 3: Save Prompt Data to File + with open(promptSelection_file, 'wb') as f: + pickle.dump(prompts, f) + + print(f"Prompt data has been saved to '{promptSelection_file}'") + + +if __name__ == "__main__": + main() + From 7cf39574f00efb9b11e67bd48ad697ba4a82e40b Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 3 Nov 2023 16:22:46 -0300 Subject: [PATCH 03/25] delete unrealatething --- .../DataSelection.py => DataSelection.py | 0 ...timation.py => DrawThePlotAndEstimation.py | 0 .../cleanData.py => cleanData.py | 0 
{gitHubOfficalCode/data => data}/test.jsonl | 0 .../gpt-neo-1.3b/decode.py | 0 .../gpt-neo-1.3b/generate_python_scripts.sh | 0 .../gpt-neo-1.3b/generate_slurm_scripts.sh | 0 .../gpt-neo-1.3b/submit_all_jobs.sh | 0 .../gpt-neo-125m/decode.py | 0 .../gpt-neo-125m/generate_python_scripts.sh | 0 .../gpt-neo-125m/generate_slurm_scripts.sh | 0 .../gpt-neo-125m/submit_all_jobs.sh | 0 .../gpt-neo-2.7b/decode.py | 0 .../gpt-neo-2.7b/generate_python_scripts.sh | 0 .../gpt-neo-2.7b/generate_slurm_scripts.sh | 0 .../gpt-neo-2.7b/submit_all_jobs.sh | 0 gitHubOfficalCode/.idea/.gitignore | 3 - gitHubOfficalCode/.idea/gitHubOfficalCode.iml | 8 - .../inspectionProfiles/Project_Default.xml | 157 ------------------ .../inspectionProfiles/profiles_settings.xml | 6 - gitHubOfficalCode/.idea/misc.xml | 4 - gitHubOfficalCode/.idea/modules.xml | 8 - gitHubOfficalCode/heaplaw.py => heaplaw.py | 0 .../processData.py => processData.py | 0 .../promtSelection.py => promtSelection.py | 0 25 files changed, 186 deletions(-) rename gitHubOfficalCode/DataSelection.py => DataSelection.py (100%) rename gitHubOfficalCode/DrawThePlotAndEstimation.py => DrawThePlotAndEstimation.py (100%) rename gitHubOfficalCode/cleanData.py => cleanData.py (100%) rename {gitHubOfficalCode/data => data}/test.jsonl (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-1.3b/decode.py (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-1.3b/generate_python_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-1.3b/generate_slurm_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-1.3b/submit_all_jobs.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-125m/decode.py (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-125m/generate_python_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-125m/generate_slurm_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-125m/submit_all_jobs.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-2.7b/decode.py (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-2.7b/generate_python_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-2.7b/generate_slurm_scripts.sh (100%) rename {gitHubOfficalCode/dataGenaration => dataGenaration}/gpt-neo-2.7b/submit_all_jobs.sh (100%) delete mode 100644 gitHubOfficalCode/.idea/.gitignore delete mode 100644 gitHubOfficalCode/.idea/gitHubOfficalCode.iml delete mode 100644 gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml delete mode 100644 gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml delete mode 100644 gitHubOfficalCode/.idea/misc.xml delete mode 100644 gitHubOfficalCode/.idea/modules.xml rename gitHubOfficalCode/heaplaw.py => heaplaw.py (100%) rename gitHubOfficalCode/processData.py => processData.py (100%) rename gitHubOfficalCode/promtSelection.py => promtSelection.py (100%) diff --git a/gitHubOfficalCode/DataSelection.py b/DataSelection.py similarity index 100% rename from gitHubOfficalCode/DataSelection.py rename to DataSelection.py diff --git a/gitHubOfficalCode/DrawThePlotAndEstimation.py b/DrawThePlotAndEstimation.py similarity index 100% rename from gitHubOfficalCode/DrawThePlotAndEstimation.py rename to DrawThePlotAndEstimation.py diff --git a/gitHubOfficalCode/cleanData.py b/cleanData.py similarity index 100% rename from 
gitHubOfficalCode/cleanData.py rename to cleanData.py diff --git a/gitHubOfficalCode/data/test.jsonl b/data/test.jsonl similarity index 100% rename from gitHubOfficalCode/data/test.jsonl rename to data/test.jsonl diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py b/dataGenaration/gpt-neo-1.3b/decode.py similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/decode.py rename to dataGenaration/gpt-neo-1.3b/decode.py diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh b/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh rename to dataGenaration/gpt-neo-1.3b/generate_python_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh b/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh rename to dataGenaration/gpt-neo-1.3b/generate_slurm_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh b/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh rename to dataGenaration/gpt-neo-1.3b/submit_all_jobs.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py b/dataGenaration/gpt-neo-125m/decode.py similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-125m/decode.py rename to dataGenaration/gpt-neo-125m/decode.py diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh b/dataGenaration/gpt-neo-125m/generate_python_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_python_scripts.sh rename to dataGenaration/gpt-neo-125m/generate_python_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh b/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh rename to dataGenaration/gpt-neo-125m/generate_slurm_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh b/dataGenaration/gpt-neo-125m/submit_all_jobs.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-125m/submit_all_jobs.sh rename to dataGenaration/gpt-neo-125m/submit_all_jobs.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py b/dataGenaration/gpt-neo-2.7b/decode.py similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/decode.py rename to dataGenaration/gpt-neo-2.7b/decode.py diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh b/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh rename to dataGenaration/gpt-neo-2.7b/generate_python_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh b/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh similarity index 100% rename from gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh rename to dataGenaration/gpt-neo-2.7b/generate_slurm_scripts.sh diff --git a/gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh b/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh similarity index 100% rename 
from gitHubOfficalCode/dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh rename to dataGenaration/gpt-neo-2.7b/submit_all_jobs.sh diff --git a/gitHubOfficalCode/.idea/.gitignore b/gitHubOfficalCode/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/gitHubOfficalCode/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/gitHubOfficalCode/.idea/gitHubOfficalCode.iml b/gitHubOfficalCode/.idea/gitHubOfficalCode.iml deleted file mode 100644 index d0876a7..0000000 --- a/gitHubOfficalCode/.idea/gitHubOfficalCode.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml b/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 4ea2cf8..0000000 --- a/gitHubOfficalCode/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,157 +0,0 @@ - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml b/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/gitHubOfficalCode/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/misc.xml b/gitHubOfficalCode/.idea/misc.xml deleted file mode 100644 index a971a2c..0000000 --- a/gitHubOfficalCode/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/.idea/modules.xml b/gitHubOfficalCode/.idea/modules.xml deleted file mode 100644 index c7b47ab..0000000 --- a/gitHubOfficalCode/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/gitHubOfficalCode/heaplaw.py b/heaplaw.py similarity index 100% rename from gitHubOfficalCode/heaplaw.py rename to heaplaw.py diff --git a/gitHubOfficalCode/processData.py b/processData.py similarity index 100% rename from gitHubOfficalCode/processData.py rename to processData.py diff --git a/gitHubOfficalCode/promtSelection.py b/promtSelection.py similarity index 100% rename from gitHubOfficalCode/promtSelection.py rename to promtSelection.py From 5185fc42bbaacfa6a1cec516e60f0886f6a059f2 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Mon, 20 Nov 2023 14:54:23 -0400 Subject: [PATCH 04/25] update the table --- DrawThePlotAndEstimation.py | 133 +++++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 55 deletions(-) diff --git a/DrawThePlotAndEstimation.py b/DrawThePlotAndEstimation.py index 6e89451..045f543 100644 --- a/DrawThePlotAndEstimation.py +++ b/DrawThePlotAndEstimation.py @@ -1,57 +1,80 @@ -import math import pickle + +import pandas as pd import numpy as np -from matplotlib import pyplot as plt - -class pubMed: - def __init__(self): - self.data = {} - - def loadDocument(self, filename, label): - with open(filename, 'rb') as f: - xy = pickle.load(f) - x, y = [], [] - for xAndy in xy: - x.append(xAndy[0]) - y.append(xAndy[1]) - # x.append(math.log10(xAndy[0])) - # y.append(math.log10(xAndy[1])) - self.data[label] = (x, y) - - def DrawHeapLaw(self, step): - colors = ['r', 'g', 'b'] - plt.ylabel('Vocabulary Size') - plt.xlabel('Collection Size') - plt.title("Heaps' law") - - for idx, (label, (x, y)) in enumerate(self.data.items()): - step -= 1 - i = 0 - print(x) - filtered_x, filtered_y = [], [] - while i < len(x): - - filtered_x.append(x[i]) - 
filtered_y.append(y[i]) - i += step - plt.plot(filtered_x, filtered_y, color=colors[idx], label=label) - beta , logk = np.polyfit(filtered_x, filtered_y, 1) - print(f"Slope for {label}: {beta}") - print(f"Slope for {label}: {10**logk}") - - - - - plt.legend() - plt.grid(True) - plt.savefig('-loglog.pdf', transparent=True) - plt.show() - - -onehu = pubMed() -files = [ 'heapLawData-selectedPromt.pkl' , "heaplaw125m.pkl","heaplaw1.3b.pkl"] -labels = ["PubMed","125m","1.3b" ] -for file, label in zip(files, labels): - onehu.loadDocument(file, label) - -onehu.DrawHeapLaw(10) +import matplotlib.pyplot as plt +import seaborn as sns +import statsmodels.api as sm + + +def analyze_corpus(file_path, corpus_name): + # Read data + + + with open(file_path, 'rb') as f: + data = pickle.load(f) + data = data.drop(0) # Drop the first row + + # Fit model + X = sm.add_constant(np.log10(data['n'])) + model = sm.OLS(np.log10(data['m']), X).fit() + + # Summary statistics + alpha_hat = 10 ** model.params[0] + beta_hat = model.params[1] + rsq = model.rsquared + print(f"{corpus_name} - alpha (est): {alpha_hat:.4f}, beta (est): {beta_hat:.4f}, R squared: {rsq:.4f}") + + # Confidence intervals + conf_int = model.conf_int(alpha=0.1) # 90% CI + alpha_ci_low, alpha_ci_high = 10 ** conf_int.iloc[0, 0], 10 ** conf_int.iloc[0, 1] + beta_ci_low, beta_ci_high = conf_int.iloc[1, 0], conf_int.iloc[1, 1] + print(f"{corpus_name} - 90% CI for alpha: [{alpha_ci_low:.4f}, {alpha_ci_high:.4f}]") + print(f"{corpus_name} - 90% CI for beta: [{beta_ci_low:.4f}, {beta_ci_high:.4f}]") + + return data, model.params + + + + +pubmed_data, pubmed_params = analyze_corpus('data/pubmed.pkl', 'PubMed') +gptneo125m_data, gptneo125m_params = analyze_corpus('data/heapLawData-gptneo125m.pkl', 'GPT-Neo-125m') +gptneo13b_data, gptneo13b_params = analyze_corpus('data/heapLawData-gptneo1.3B.pkl', 'GPT-Neo-1.3B') +gptneo27b_data, gptneo27b_params = analyze_corpus('data/heapLawData-gptneo2.7B.pkl', 'GPT-Neo-2.7B') + + + +def plot_corpus(data, params, corpus_name, ax, log_scale=False): + if log_scale: + sns.lineplot(x=np.log10(data['n']), y=np.log10(data['m']), ax=ax, label=f"{corpus_name}: β={params[1]:.4f}") + else: + sns.lineplot(x=data['n'], y=data['m'], ax=ax, label=f"{corpus_name}: β={params[1]:.4f}") + +# Natural Scale Plot +plt.figure(figsize=(12, 8)) +ax1 = plt.subplot(1, 2, 1) +plot_corpus(pubmed_data, pubmed_params, 'PubMed', ax1) +plot_corpus(gptneo125m_data, gptneo125m_params, 'GPT-Neo-125m', ax1) +plot_corpus(gptneo13b_data, gptneo13b_params, 'GPT-Neo-1.3B', ax1) +plot_corpus(gptneo27b_data, gptneo27b_params, 'GPT-Neo-2.7B', ax1) +plt.title("Heaps' Law - Natural Scale") +plt.xlabel('Total Words') +plt.ylabel('Vocabulary Size') +plt.legend() + +# Log-Log Scale Plot +ax2 = plt.subplot(1, 2, 2) +plot_corpus(pubmed_data, pubmed_params, 'PubMed', ax2, log_scale=True) +plot_corpus(gptneo125m_data, gptneo125m_params, 'GPT-Neo-125m', ax2, log_scale=True) +plot_corpus(gptneo13b_data, gptneo13b_params, 'GPT-Neo-1.3B', ax2, log_scale=True) +plot_corpus(gptneo27b_data, gptneo27b_params, 'GPT-Neo-2.7B', ax2, log_scale=True) +plt.title("Heaps' Law - Log-Log Scale") +plt.xlabel('Log Total Words') +plt.ylabel('Log Vocabulary Size') +plt.legend() + +plt.tight_layout() +# plt.show() + + + From eeabf42dba5d12083a5ea87907957e2bd1036dc9 Mon Sep 17 00:00:00 2001 From: uyen lai <53119641+rachelxx03@users.noreply.github.com> Date: Thu, 23 Nov 2023 06:11:58 -0400 Subject: [PATCH 05/25] Update README.md --- README.md | 93 
++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a99f5f..2e3aaa2 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,94 @@ # Heaps' Law in GPT-Neo Large Language Model Emulated Corpora - Official repository for the workshop paper Heaps' Law in GPT-Neo Large Language Model Emulated Corpora +ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 + +## Getting Started + +Clone this repository by running the command +``` +git clone https://github.com/paul-sheridan/paper-heaps-law-llm.git +``` +and `cd` into the repository root folder `paper-heaps-law-llm`. + +## Data + + +We download the data from The Pile, it is a big data set contain of many small data sets and **PubMed** is one of them. +You can choose whatever dataset you want from here. +https://pile.eleuther.ai/ + +## Prepare the envinrontment + +Repository code is written in Python 3 using Narval Cluster provided by Digital Research Alliance of Canada(https://docs.alliancecan.ca/wiki/Getting_started). +While there are multiple ways to run a repository, here is one way to do it using Narval: + +From the command line, create a virtual environment: + +``` +virtualenv /project/def-yourName/yourDirctory +``` + +## Running Repository Code + +**DATA SELECTION** +In this research we process 500.000 PubMed documents, so you can navigate to the file and change the amount of document or the document you want to process +``` +python dataSelection +``` + +**Clean Data** +clean the data using the method we mention in the paper +``` +python cleanData.py +``` + +**Promt Selection** +Choose the seed for the LLMs +``` +python promtSelection.py +``` + +**Data Genaration** +Generate data from LLMS using the seed we created. + +Navigate to folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. +``` +.\generate_python_scripts.sh +.\generate_slurm_scripts.sh +.\submit_all_jobs.sh +``` + +After that navigate to each folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. and run +``` +python decode.py +``` +and we use the same cleaning data strageries to clean the data from LLMs + +**Heap's law data calculation** +heap's law need number of vocabulary and number of total word in documents so we need to navigate and produce the result use: +``` +python heaplaw.py +``` + +**Heap's law visualization** +generate the plot using +``` +python drawThePlotAndEstimation.py +``` + + + +**** + + + +## Citation +If you find anything useful please cite our work using: +``` +@misc{SarriaHurtado2023, + author = {Uyen Lai, Gurjit S. Randhawa, Paul Sheridan}, + title = {Heaps' Law in GPT-Neo Large Language Model Emulated Corpora}, + year = {2023}, + eprint = {arXiv:2311.06377v1} +} +``` From 6c2b9d3c7056186dfaedeeb297e0d9cde74778bc Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 07:45:14 -0400 Subject: [PATCH 06/25] fixed citation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e3aaa2..51f6e39 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ python drawThePlotAndEstimation.py ## Citation If you find anything useful please cite our work using: ``` -@misc{SarriaHurtado2023, +@misc{Lai2023, author = {Uyen Lai, Gurjit S. 
Randhawa, Paul Sheridan}, title = {Heaps' Law in GPT-Neo Large Language Model Emulated Corpora}, year = {2023}, From 4919772739a6b9e127fd14108ee61b7b80af80cc Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:01:07 -0400 Subject: [PATCH 07/25] copy edits --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 51f6e39..0c3847f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # Heaps' Law in GPT-Neo Large Language Model Emulated Corpora -Official repository for the workshop paper Heaps' Law in GPT-Neo Large Language Model Emulated Corpora -ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 +This repository contains computer code for reproducing the results described in the [EVIA 2023 Workshop](https://research.nii.ac.jp/ntcir/evia2023/) paper "Heaps' Law in GPT-Neo Large Language Model Emulated Corpora". ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 ## Getting Started From 1ccac3146391777a087811496265a645de0c6f15 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:06:39 -0400 Subject: [PATCH 08/25] copy edits --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0c3847f..8301ba3 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,11 @@ git clone https://github.com/paul-sheridan/paper-heaps-law-llm.git ``` and `cd` into the repository root folder `paper-heaps-law-llm`. -## Data +## Obtaining the Data + +Download the **Pubmed Abstracts** component data from The Pile (https://pile.eleuther.ai/), an 800GB dataset of diverse text for language modeling. -We download the data from The Pile, it is a big data set contain of many small data sets and **PubMed** is one of them. -You can choose whatever dataset you want from here. -https://pile.eleuther.ai/ ## Prepare the envinrontment From 7ed13586d3d55fe1ba97b3895b939c67023d6cd3 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:07:37 -0400 Subject: [PATCH 09/25] copy edits --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8301ba3..26e89e6 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,10 @@ and `cd` into the repository root folder `paper-heaps-law-llm`. ## Obtaining the Data -Download the **Pubmed Abstracts** component data from The Pile (https://pile.eleuther.ai/), an 800GB dataset of diverse text for language modeling. +Download the **Pubmed Abstracts** component dataset from The Pile ([download page](https://pile.eleuther.ai/)), an 800GB dataset of diverse text for language modeling. -## Prepare the envinrontment +## Prepare the Envinronment Repository code is written in Python 3 using Narval Cluster provided by Digital Research Alliance of Canada(https://docs.alliancecan.ca/wiki/Getting_started). While there are multiple ways to run a repository, here is one way to do it using Narval: From 45a2282ac5f01168e3abbce8b01a69e14b7f2b21 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:08:08 -0400 Subject: [PATCH 10/25] copy edits --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 26e89e6..17f41da 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ and `cd` into the repository root folder `paper-heaps-law-llm`. Download the **Pubmed Abstracts** component dataset from The Pile ([download page](https://pile.eleuther.ai/)), an 800GB dataset of diverse text for language modeling. 
-## Prepare the Envinronment +## Preparing the Environment Repository code is written in Python 3 using Narval Cluster provided by Digital Research Alliance of Canada(https://docs.alliancecan.ca/wiki/Getting_started). While there are multiple ways to run a repository, here is one way to do it using Narval: From f65342ea1e74e5ba8d87850da897bc93918fcd38 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:08:38 -0400 Subject: [PATCH 11/25] copy edits --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 17f41da..a5d39df 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ and `cd` into the repository root folder `paper-heaps-law-llm`. ## Obtaining the Data -Download the **Pubmed Abstracts** component dataset from The Pile ([download page](https://pile.eleuther.ai/)), an 800GB dataset of diverse text for language modeling. +Download the **Pubmed Abstracts** component corpus from The Pile ([download page](https://pile.eleuther.ai/)), an 800GB dataset of diverse text for language modeling. ## Preparing the Environment From 9b23ae0d7ba786c17247b16ecd3b9e7ccb8c159e Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Thu, 23 Nov 2023 15:09:51 -0400 Subject: [PATCH 12/25] copy edits --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a5d39df..8a7b248 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Heaps' Law in GPT-Neo Large Language Model Emulated Corpora -This repository contains computer code for reproducing the results described in the [EVIA 2023 Workshop](https://research.nii.ac.jp/ntcir/evia2023/) paper "Heaps' Law in GPT-Neo Large Language Model Emulated Corpora". ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 +This repository contains computer code for reproducing the results described in the EVIA 2023 Workshop ([landing page](https://research.nii.ac.jp/ntcir/evia2023/)) paper "Heaps' Law in GPT-Neo Large Language Model Emulated Corpora". ArXiv preprint link: https://arxiv.org/abs/2311.06377v1 ## Getting Started From 43a07d30b6fb7ec671f5454eecb7186ef90cef2e Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:18:17 -0400 Subject: [PATCH 13/25] copy edits --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8a7b248..dd4c70f 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,9 @@ Download the **Pubmed Abstracts** component corpus from The Pile ([download page ## Preparing the Environment -Repository code is written in Python 3 using Narval Cluster provided by Digital Research Alliance of Canada(https://docs.alliancecan.ca/wiki/Getting_started). -While there are multiple ways to run a repository, here is one way to do it using Narval: +Repository code is written in Python 3. It was run on the Narval cluster ([Narval wiki page](https://docs.alliancecan.ca/wiki/Narval/en)), provided by Digital Research Alliance of Canada ([Getting started wiki page](https://docs.alliancecan.ca/wiki/Getting_started)). 
+ +While there are multiple ways to run the repository code, here is one way to do it using Narval: From the command line, create a virtual environment: From ff1623a7357fce2b87c4b2b5527c2789cd7f6676 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:31:02 -0400 Subject: [PATCH 14/25] copy edits --- README.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index dd4c70f..2e9a9c6 100644 --- a/README.md +++ b/README.md @@ -29,25 +29,33 @@ virtualenv /project/def-yourName/yourDirctory ## Running Repository Code -**DATA SELECTION** -In this research we process 500.000 PubMed documents, so you can navigate to the file and change the amount of document or the document you want to process +### Data Selection + +In this research we analyze the first 500,000 abstracts in the PubMed Abstracts corpus. To prepare this dataset, run the `dataSelection.py` script: + ``` -python dataSelection +python dataSelection.py ``` -**Clean Data** -clean the data using the method we mention in the paper +To select a custom number of abstracts, navigate to the `dataSelection.py` script and set the `limit` variable on line 8 to be the number of documents that you want to process. + +### Data Preprocessing + +To preprocess the data according to the steps described in the paper, run: + ``` python cleanData.py ``` -**Promt Selection** +### Promt Selection + Choose the seed for the LLMs + ``` python promtSelection.py ``` -**Data Genaration** +### Data Genaration Generate data from LLMS using the seed we created. Navigate to folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. @@ -63,13 +71,14 @@ python decode.py ``` and we use the same cleaning data strageries to clean the data from LLMs -**Heap's law data calculation** +### Heaps' Law Estimation + heap's law need number of vocabulary and number of total word in documents so we need to navigate and produce the result use: ``` python heaplaw.py ``` -**Heap's law visualization** +### Heaps' Law Visualization generate the plot using ``` python drawThePlotAndEstimation.py From 7d8f154b940a1967431c56deb6c8eb39f3093c85 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:37:32 -0400 Subject: [PATCH 15/25] copy edits --- README.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2e9a9c6..e944ea1 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,6 @@ Repository code is written in Python 3. It was run on the Narval cluster ([Narva While there are multiple ways to run the repository code, here is one way to do it using Narval: From the command line, create a virtual environment: - ``` virtualenv /project/def-yourName/yourDirctory ``` @@ -32,7 +31,6 @@ virtualenv /project/def-yourName/yourDirctory ### Data Selection In this research we analyze the first 500,000 abstracts in the PubMed Abstracts corpus. To prepare this dataset, run the `dataSelection.py` script: - ``` python dataSelection.py ``` @@ -42,34 +40,32 @@ To select a custom number of abstracts, navigate to the `dataSelection.py` scrip ### Data Preprocessing To preprocess the data according to the steps described in the paper, run: - ``` python cleanData.py ``` -### Promt Selection - -Choose the seed for the LLMs +### Prompt Selection +To choose seed text for abstract emulation, run: ``` python promtSelection.py ``` ### Data Genaration -Generate data from LLMS using the seed we created. -Navigate to folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. 
+To emulate text from the GPTNeo LLMs using the above generated seed texts, run the following shell scripts from inside each of the folders `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b`: ``` .\generate_python_scripts.sh .\generate_slurm_scripts.sh .\submit_all_jobs.sh ``` -After that navigate to each folder gpt-neo-125m, gpt-neo-1.3b, gpt-neo-2.7b. and run +After that navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run ``` python decode.py ``` -and we use the same cleaning data strageries to clean the data from LLMs +This script applies the same preprocesing strageries as used above. + ### Heaps' Law Estimation From 7f0842a6bac51a9234a9018182fa32ff5c92619c Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:42:36 -0400 Subject: [PATCH 16/25] copy edits --- README.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e944ea1..bc2a939 100644 --- a/README.md +++ b/README.md @@ -69,23 +69,20 @@ This script applies the same preprocesing strageries as used above. ### Heaps' Law Estimation -heap's law need number of vocabulary and number of total word in documents so we need to navigate and produce the result use: +To estimate the Heaps' law parameters for each GPTNeo model using simple linear regression, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run ``` -python heaplaw.py +python heapsLaw.py ``` +The parameter estimates are found in Table 1 of the paper. ### Heaps' Law Visualization -generate the plot using + +To generate the plots of Figure 1 in the paper, run ``` python drawThePlotAndEstimation.py ``` - -**** - - - ## Citation If you find anything useful please cite our work using: ``` From 21ebbf77367845f7f65fe4d992c6658f9f022aa1 Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 06:45:59 -0400 Subject: [PATCH 17/25] copy edits --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bc2a939..5c171eb 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ python promtSelection.py ### Data Genaration -To emulate text from the GPTNeo LLMs using the above generated seed texts, run the following shell scripts from inside each of the folders `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b`: +To emulate text from the GPT-Neo models using the above generated seed texts, run the following shell scripts from inside each of the folders `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b`: ``` .\generate_python_scripts.sh .\generate_slurm_scripts.sh @@ -69,7 +69,7 @@ This script applies the same preprocesing strageries as used above. 
### Heaps' Law Estimation -To estimate the Heaps' law parameters for each GPTNeo model using simple linear regression, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run +To estimate the Heaps' law parameters for each GPT-Neo model using simple linear regression, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run ``` python heapsLaw.py ``` From 97c3d0e1b9f6ff3bb46746ee68bdf0415925a84e Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:11:18 -0400 Subject: [PATCH 18/25] update format update format for the file name --- DrawThePlotAndEstimation.py | 31 +++++++++++++++++-------------- heaplaw.py => heapsLaw.py | 0 2 files changed, 17 insertions(+), 14 deletions(-) rename heaplaw.py => heapsLaw.py (100%) diff --git a/DrawThePlotAndEstimation.py b/DrawThePlotAndEstimation.py index 045f543..d83618b 100644 --- a/DrawThePlotAndEstimation.py +++ b/DrawThePlotAndEstimation.py @@ -1,19 +1,18 @@ -import pickle - import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import statsmodels.api as sm - -def analyze_corpus(file_path, corpus_name): - # Read data - - - with open(file_path, 'rb') as f: - data = pickle.load(f) - data = data.drop(0) # Drop the first row +def analyze_corpus(file_path, corpus_name, file_type='csv'): + # Read data based on file type + if file_type == 'csv': + data = pd.read_csv(file_path, header=None, names=['n', 'm']) + data = data.drop(0) # Drop the first row + elif file_type == 'pkl': + data = pd.read_pickle(file_path) + else: + raise ValueError("Unsupported file type") # Fit model X = sm.add_constant(np.log10(data['n'])) @@ -34,13 +33,17 @@ def analyze_corpus(file_path, corpus_name): return data, model.params +pubmed_data, pubmed_params = analyze_corpus('data/pubmed.pkl', 'PubMed',file_type='csv') + +# Analyzing the GPT-Neo datasets (assuming they are in .pkl format) +gptneo125m_data, gptneo125m_params = analyze_corpus('data/heapLawData-gptneo125m.pkl', 'GPT-Neo-125m', file_type='pkl') +gptneo13b_data, gptneo13b_params = analyze_corpus('data/heapLawData-gptneo1.3B.pkl', 'GPT-Neo-1.3B', file_type='pkl') +gptneo27b_data, gptneo27b_params = analyze_corpus('data/heapLawData-gptneo2.7B.pkl', 'GPT-Neo-2.7B', file_type='pkl') + +# Rest of the code remains the same -pubmed_data, pubmed_params = analyze_corpus('data/pubmed.pkl', 'PubMed') -gptneo125m_data, gptneo125m_params = analyze_corpus('data/heapLawData-gptneo125m.pkl', 'GPT-Neo-125m') -gptneo13b_data, gptneo13b_params = analyze_corpus('data/heapLawData-gptneo1.3B.pkl', 'GPT-Neo-1.3B') -gptneo27b_data, gptneo27b_params = analyze_corpus('data/heapLawData-gptneo2.7B.pkl', 'GPT-Neo-2.7B') diff --git a/heaplaw.py b/heapsLaw.py similarity index 100% rename from heaplaw.py rename to heapsLaw.py From bd0f2146bd20bdc15d783f9a74f3503785d8f55c Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 15:15:04 -0400 Subject: [PATCH 19/25] file name update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5c171eb..4fcfbd3 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ python cleanData.py To choose seed text for abstract emulation, run: ``` -python promtSelection.py +python promptSelection.py ``` ### Data Genaration From 3db2244cd4f4bdcf4e4cca50d902afe6a0a73dfc Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 15:16:32 -0400 Subject: [PATCH 20/25] typo fix --- README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4fcfbd3..79b137b 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ While there are multiple ways to run the repository code, here is one way to do From the command line, create a virtual environment: ``` -virtualenv /project/def-yourName/yourDirctory +virtualenv /project/def-yourName/yourDirectory ``` ## Running Repository Code From cd24aaf7ca70c863837549193ac8bbeb53428044 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:18:07 -0400 Subject: [PATCH 21/25] format --- .idea/.gitignore | 3 + .idea/inspectionProfiles/Project_Default.xml | 193 ++++++++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 + .idea/modules.xml | 8 + .idea/paper-heaps-law-llm.iml | 8 + .idea/vcs.xml | 6 + 6 files changed, 224 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/paper-heaps-law-llm.iml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..48285f1 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,193 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..03b57fd --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/paper-heaps-law-llm.iml b/.idea/paper-heaps-law-llm.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/paper-heaps-law-llm.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 4aec00d31a075d1542699b472a4e11bf3317c102 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:23:31 -0400 Subject: [PATCH 22/25] change thr prompt file name --- promtSelection.py => promptSelection.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename promtSelection.py => promptSelection.py (100%) diff --git a/promtSelection.py b/promptSelection.py similarity index 100% rename from promtSelection.py rename to promptSelection.py From 713e981f3cedcfc0abf199472721d0ce47305184 Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:28:02 -0400 Subject: [PATCH 23/25] commit --- DrawThePlotAndEstimation.py | 83 ---------------------------- DataSelection.py => dataSelection.py | 0 drawThePlotAndEstimattion.py | 31 +++++++++++ 3 files changed, 31 insertions(+), 83 deletions(-) delete mode 100644 DrawThePlotAndEstimation.py rename DataSelection.py => dataSelection.py (100%) create mode 100644 drawThePlotAndEstimattion.py 
diff --git a/DrawThePlotAndEstimation.py b/DrawThePlotAndEstimation.py deleted file mode 100644 index d83618b..0000000 --- a/DrawThePlotAndEstimation.py +++ /dev/null @@ -1,83 +0,0 @@ -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -import statsmodels.api as sm - -def analyze_corpus(file_path, corpus_name, file_type='csv'): - # Read data based on file type - if file_type == 'csv': - data = pd.read_csv(file_path, header=None, names=['n', 'm']) - data = data.drop(0) # Drop the first row - elif file_type == 'pkl': - data = pd.read_pickle(file_path) - else: - raise ValueError("Unsupported file type") - - # Fit model - X = sm.add_constant(np.log10(data['n'])) - model = sm.OLS(np.log10(data['m']), X).fit() - - # Summary statistics - alpha_hat = 10 ** model.params[0] - beta_hat = model.params[1] - rsq = model.rsquared - print(f"{corpus_name} - alpha (est): {alpha_hat:.4f}, beta (est): {beta_hat:.4f}, R squared: {rsq:.4f}") - - # Confidence intervals - conf_int = model.conf_int(alpha=0.1) # 90% CI - alpha_ci_low, alpha_ci_high = 10 ** conf_int.iloc[0, 0], 10 ** conf_int.iloc[0, 1] - beta_ci_low, beta_ci_high = conf_int.iloc[1, 0], conf_int.iloc[1, 1] - print(f"{corpus_name} - 90% CI for alpha: [{alpha_ci_low:.4f}, {alpha_ci_high:.4f}]") - print(f"{corpus_name} - 90% CI for beta: [{beta_ci_low:.4f}, {beta_ci_high:.4f}]") - - return data, model.params - -pubmed_data, pubmed_params = analyze_corpus('data/pubmed.pkl', 'PubMed',file_type='csv') - -# Analyzing the GPT-Neo datasets (assuming they are in .pkl format) -gptneo125m_data, gptneo125m_params = analyze_corpus('data/heapLawData-gptneo125m.pkl', 'GPT-Neo-125m', file_type='pkl') -gptneo13b_data, gptneo13b_params = analyze_corpus('data/heapLawData-gptneo1.3B.pkl', 'GPT-Neo-1.3B', file_type='pkl') -gptneo27b_data, gptneo27b_params = analyze_corpus('data/heapLawData-gptneo2.7B.pkl', 'GPT-Neo-2.7B', file_type='pkl') - -# Rest of the code remains the same - - - - - - -def plot_corpus(data, params, corpus_name, ax, log_scale=False): - if log_scale: - sns.lineplot(x=np.log10(data['n']), y=np.log10(data['m']), ax=ax, label=f"{corpus_name}: β={params[1]:.4f}") - else: - sns.lineplot(x=data['n'], y=data['m'], ax=ax, label=f"{corpus_name}: β={params[1]:.4f}") - -# Natural Scale Plot -plt.figure(figsize=(12, 8)) -ax1 = plt.subplot(1, 2, 1) -plot_corpus(pubmed_data, pubmed_params, 'PubMed', ax1) -plot_corpus(gptneo125m_data, gptneo125m_params, 'GPT-Neo-125m', ax1) -plot_corpus(gptneo13b_data, gptneo13b_params, 'GPT-Neo-1.3B', ax1) -plot_corpus(gptneo27b_data, gptneo27b_params, 'GPT-Neo-2.7B', ax1) -plt.title("Heaps' Law - Natural Scale") -plt.xlabel('Total Words') -plt.ylabel('Vocabulary Size') -plt.legend() - -# Log-Log Scale Plot -ax2 = plt.subplot(1, 2, 2) -plot_corpus(pubmed_data, pubmed_params, 'PubMed', ax2, log_scale=True) -plot_corpus(gptneo125m_data, gptneo125m_params, 'GPT-Neo-125m', ax2, log_scale=True) -plot_corpus(gptneo13b_data, gptneo13b_params, 'GPT-Neo-1.3B', ax2, log_scale=True) -plot_corpus(gptneo27b_data, gptneo27b_params, 'GPT-Neo-2.7B', ax2, log_scale=True) -plt.title("Heaps' Law - Log-Log Scale") -plt.xlabel('Log Total Words') -plt.ylabel('Log Vocabulary Size') -plt.legend() - -plt.tight_layout() -# plt.show() - - - diff --git a/DataSelection.py b/dataSelection.py similarity index 100% rename from DataSelection.py rename to dataSelection.py diff --git a/drawThePlotAndEstimattion.py b/drawThePlotAndEstimattion.py new file mode 100644 index 0000000..8152087 --- /dev/null +++ 
b/drawThePlotAndEstimattion.py @@ -0,0 +1,31 @@ +import json +import pickle +from processData import process_data + +def main(): + file_path = 'data/test.jsonl' + processed_data = [] + limit = 500000 + prompt_length = 5 + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + data_chunk = json.loads(line).get('text', '') # Assuming each line is a JSON object with an 'abstract' field + processed_line = process_data(data_chunk) + if len(processed_line) > prompt_length: # Check if the processed line has more than 10 words + processed_data.append(processed_line) + if len(processed_data) >= limit: + break + with open('processData.pickle', 'wb') as f: + pickle.dump(processed_data, f) + print("Data has been processed and saved to 'processData.pickle'") + except FileNotFoundError: + print("The file was not found. Please check the file path.") + except json.JSONDecodeError: + print("Error decoding JSON. Please check the file content.") + except Exception as e: + print("An error occurred:", str(e)) + +if __name__ == "__main__": + main() + From 660dfbc46801f1b18967dad3e229db172886d04d Mon Sep 17 00:00:00 2001 From: rachelxx03 <53119641+rachelxx03@users.noreply.github.com> Date: Fri, 24 Nov 2023 15:28:51 -0400 Subject: [PATCH 24/25] delete --- .idea/.gitignore | 3 - .idea/inspectionProfiles/Project_Default.xml | 193 ------------------ .../inspectionProfiles/profiles_settings.xml | 6 - .idea/modules.xml | 8 - .idea/paper-heaps-law-llm.iml | 8 - .idea/vcs.xml | 6 - 6 files changed, 224 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/inspectionProfiles/Project_Default.xml delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/paper-heaps-law-llm.iml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 48285f1..0000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,193 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 03b57fd..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/paper-heaps-law-llm.iml b/.idea/paper-heaps-law-llm.iml deleted file mode 100644 index d0876a7..0000000 --- a/.idea/paper-heaps-law-llm.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file From 56974355307aaa517ba482b6e662c6eb2fdc6c2e Mon Sep 17 00:00:00 2001 From: Paul Sheridan Date: Fri, 24 Nov 2023 15:34:15 -0400 Subject: [PATCH 25/25] copy edits --- README.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 79b137b..a7a21ab 100644 --- a/README.md +++ b/README.md @@ -67,17 +67,14 @@ python decode.py This 
script applies the same preprocesing strageries as used above. -### Heaps' Law Estimation +### Heaps' Law Estimation and Visualization -To estimate the Heaps' law parameters for each GPT-Neo model using simple linear regression, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run +To prepare the emulated texts for analysis, navigate to each folder `gpt-neo-125m`, `gpt-neo-1.3b`, `gpt-neo-2.7b` and run ``` python heapsLaw.py ``` -The parameter estimates are found in Table 1 of the paper. -### Heaps' Law Visualization - -To generate the plots of Figure 1 in the paper, run +To generate the plots of Figure 1 and Heaps' law parameter estimates of Table 1, run ``` python drawThePlotAndEstimation.py ```
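
The body of `promptSelection.py` (renamed from `promtSelection.py` in patch 22) is not included in these patches. A minimal sketch of the seed-selection step, assuming the cleaned abstracts are the ones pickled during data selection and that each seed is simply the first `prompt_length = 5` words of an abstract (both assumptions, not behaviour confirmed by the repository code):

```
# Hypothetical sketch only: promptSelection.py itself is not shown in these
# patches. Assumptions: the cleaned abstracts are stored in
# 'processData.pickle', and a seed prompt is the first 5 words of an abstract.
import pickle

PROMPT_LENGTH = 5  # mirrors the prompt_length variable used during data selection


def first_words(abstract, k=PROMPT_LENGTH):
    # The return type of process_data() is not shown here, so handle either a
    # whitespace-delimited string or a list of tokens.
    tokens = abstract.split() if isinstance(abstract, str) else list(abstract)
    return " ".join(tokens[:k])


def main():
    with open("processData.pickle", "rb") as f:
        abstracts = pickle.load(f)
    prompts = [first_words(a) for a in abstracts]
    with open("prompts.pickle", "wb") as f:  # placeholder output name
        pickle.dump(prompts, f)
    print(f"Saved {len(prompts)} seed prompts to 'prompts.pickle'")


if __name__ == "__main__":
    main()
```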
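
The per-model generation scripts written out by `generate_python_scripts.sh` are likewise not reproduced in these patches. As an illustration only, a single generation job built on the Hugging Face `transformers` text-generation pipeline might look as follows; the prompt file name, slice bounds, and sampling parameters are placeholders rather than the repository's actual settings:

```
# Illustrative sketch of one GPT-Neo generation job; the scripts that
# generate_python_scripts.sh actually emits may differ. The prompt file name,
# slice bounds, and sampling parameters below are placeholder assumptions.
import pickle

from transformers import pipeline

MODEL_NAME = "EleutherAI/gpt-neo-125M"  # or "EleutherAI/gpt-neo-1.3B" / "EleutherAI/gpt-neo-2.7B"


def main(start=0, end=100):
    generator = pipeline("text-generation", model=MODEL_NAME)
    with open("prompts.pickle", "rb") as f:
        prompts = pickle.load(f)

    generated = []
    for prompt in prompts[start:end]:
        out = generator(prompt, do_sample=True, max_length=256, num_return_sequences=1)
        generated.append(out[0]["generated_text"])

    out_name = f"generated_{MODEL_NAME.split('/')[-1]}_{start}_{end}.pickle"
    with open(out_name, "wb") as f:
        pickle.dump(generated, f)


if __name__ == "__main__":
    main()
```

Swapping `MODEL_NAME` for the 1.3B or 2.7B checkpoint would cover the other two model sizes; on a cluster, each Slurm job would presumably run one such script over its own slice of the prompts.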
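
Similarly, `heapsLaw.py` (renamed from `heaplaw.py` in patch 18) is not shown, but the `analyze_corpus()` function above makes its required output clear: a table with a running total word count `n` and running vocabulary size `m`. Heaps' law posits m ≈ alpha * n^beta, so on log-log axes log10(m) = log10(alpha) + beta * log10(n), which is why `analyze_corpus()` fits an OLS line to (log10 n, log10 m) and exponentiates the intercept to recover alpha. A minimal sketch of that bookkeeping, with assumed input and output file names:

```
# Minimal sketch of the Heaps' law bookkeeping: record the running total word
# count n and the running vocabulary size m over the corpus, the two columns
# that analyze_corpus() expects. Input and output file names are assumptions.
import pickle

import pandas as pd


def heaps_curve(documents):
    vocabulary = set()
    total_words = 0
    rows = []
    for doc in documents:
        tokens = doc.split() if isinstance(doc, str) else list(doc)
        for token in tokens:
            total_words += 1
            vocabulary.add(token)
        rows.append({"n": total_words, "m": len(vocabulary)})
    return pd.DataFrame(rows)


if __name__ == "__main__":
    with open("cleanData.pickle", "rb") as f:  # placeholder input name
        documents = pickle.load(f)
    # Written as a .pkl so it can be read back with pd.read_pickle(), matching
    # the file_type='pkl' branch of analyze_corpus().
    heaps_curve(documents).to_pickle("heapsLawData.pkl")
```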