From c2a2dce22d8f4d31832df73853f431726df6e973 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 19 Aug 2024 13:34:19 -0400
Subject: [PATCH 01/32] create rake task and service

---
 .../tasks/download_stats_migration_service.rb |  9 +++++++
 lib/tasks/migrate_download_stats.rake         | 26 +++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 app/services/tasks/download_stats_migration_service.rb
 create mode 100644 lib/tasks/migrate_download_stats.rake

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
new file mode 100644
index 000000000..d6ccc5aba
--- /dev/null
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+module Tasks
+  class DownloadStatsMigrationService
+    PAGE_SIZE = 1000
+    def list_object_ids(output_path, after_timestamp = nil)
+      puts 'Test - Listing object ids for download stats migration'
+    end
+  end
+end
diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
new file mode 100644
index 000000000..12f3e4a3a
--- /dev/null
+++ b/lib/tasks/migrate_download_stats.rake
@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+require 'time'
+require 'optparse'
+require 'optparse/date'
+
+namespace :migrate_download_stats do
+  desc 'list object ids for download stats migration'
+  task :list_ids, [:output_dir, :after] => :environment do |_t, _args|
+    start_time = Time.now
+    puts "[#{start_time.utc.iso8601}] starting listing of ids"
+    options = {}
+
+    opts = OptionParser.new
+    opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_ids -- [options]'
+    opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val }
+    opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val }
+    args = opts.order!(ARGV) {}
+    opts.parse!(args)
+
+    file_path = Tasks::DownloadStatsMigrationService.new.list_object_ids(options[:output_dir], options[:after])
+
+    puts "Listing completed in #{Time.now - start_time}s"
+    puts "Stored id list to file: #{file_path}"
+    exit 0
+  end
+end

From fc8dbe3b088cab8dd21f66f63a12e8178e4fd2d5 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 19 Aug 2024 15:04:01 -0400
Subject: [PATCH 02/32] tests for listing ids in the download stats migration
 service

---
 .../tasks/download_stats_migration_service.rb | 15 +++++-
 spec/factories/file_download_stats.rb         | 12 +++++
 .../download_stats_migration_service_spec.rb  | 46 +++++++++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 spec/factories/file_download_stats.rb
 create mode 100644 spec/services/tasks/download_stats_migration_service_spec.rb

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index d6ccc5aba..2d9a11b9a 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -3,7 +3,20 @@ module Tasks
   class DownloadStatsMigrationService
     PAGE_SIZE = 1000
     def list_object_ids(output_path, after_timestamp = nil)
-      puts 'Test - Listing object ids for download stats migration'
+      # Build the query
+      query = FileDownloadStat.select(:id)
+      query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
+
+      # Fetch the IDs in batches
+      ids = []
+      query.find_in_batches(batch_size: PAGE_SIZE) do |batch|
+        ids.concat(batch.map(&:id))
+      end
+
+      # Write the IDs to the specified output file
+      File.open(output_path, 'w') do |file|
+        ids.each { |id| file.puts(id) }
+      end
     end
   end
 end
diff --git a/spec/factories/file_download_stats.rb b/spec/factories/file_download_stats.rb
new file mode 100644
index 000000000..48f136d5a
--- /dev/null
+++ b/spec/factories/file_download_stats.rb
@@ -0,0 +1,12 @@
+# frozen_string_literal: true
+FactoryBot.define do
+  factory :file_download_stat do
+    sequence(:id) { |n| n }  # Auto-incrementing ID
+    date { FFaker::Time.between(Date.new(2019, 1, 1), Date.new(2024, 12, 31)) }  # Random date between a range
+    downloads { rand(1..50) }
+    sequence(:file_id) { |n| "file_id_#{n}" }  # Unique file ID for each record
+    created_at { date }
+    updated_at { date }
+    user_id { rand(1..100) }
+  end
+end
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
new file mode 100644
index 000000000..5d2ca38a4
--- /dev/null
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+require 'rails_helper'
+
+RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do
+  describe '#list_object_ids' do
+    let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.txt') }
+    let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) }
+
+    before do
+      # Ensure the output file is removed before each test
+      File.delete(output_path) if File.exist?(output_path)
+    end
+
+    it 'writes all IDs to the output file' do
+      service = described_class.new
+      service.list_object_ids(output_path, nil)
+
+      expect(File).to exist(output_path)
+      output = File.read(output_path)
+      expected_ids = FileDownloadStat.pluck(:id)
+      output_ids = output.split("\n").map(&:to_i)
+
+      expect(output_ids).to match_array(expected_ids)
+    end
+
+    context 'with an after_timestamp' do
+      let(:after_timestamp) { 1.day.ago }
+      let!(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: 1.hour.ago) }
+
+      it 'filters records by the given timestamp' do
+        # Create a file_download_stat that should not be included
+        old_stat = FactoryBot.create(:file_download_stat, updated_at: 2.days.ago)
+
+        service = described_class.new
+        service.list_object_ids(output_path, after_timestamp)
+
+        output = File.read(output_path)
+        expected_ids = recent_stats.pluck(:id)
+        output_ids = output.split("\n").map(&:to_i)
+
+        expect(output_ids).not_to include(old_stat.id)
+        expect(output_ids).to match_array(expected_ids)
+      end
+    end
+  end
+end

From e7a35625ad42b75376d8ff97504c83c7cb10976d Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 19 Aug 2024 15:28:02 -0400
Subject: [PATCH 03/32] update date range in factory

---
 spec/factories/file_download_stats.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/factories/file_download_stats.rb b/spec/factories/file_download_stats.rb
index 48f136d5a..8ad1892e5 100644
--- a/spec/factories/file_download_stats.rb
+++ b/spec/factories/file_download_stats.rb
@@ -2,7 +2,7 @@
 FactoryBot.define do
   factory :file_download_stat do
     sequence(:id) { |n| n }  # Auto-incrementing ID
-    date { FFaker::Time.between(Date.new(2019, 1, 1), Date.new(2024, 12, 31)) }  # Random date between a range
+    date { FFaker::Time.between(Date.new(2019, 1, 1), Date.new(2022, 1, 31)) }  # Random date between a range
     downloads { rand(1..50) }
     sequence(:file_id) { |n| "file_id_#{n}" }  # Unique file ID for each record
     created_at { date }

From 33bea2ca06b8bb7835931981bb5fa47ea44a56fb Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Wed, 21 Aug 2024 12:15:02 -0400
Subject: [PATCH 04/32] expand function to retrieve fields besides id, test
 refactor, enforce csv dir

---
 .../tasks/download_stats_migration_service.rb | 27 ++++++---
 lib/tasks/migrate_download_stats.rake         |  7 ++-
 .../download_stats_migration_service_spec.rb  | 55 ++++++++++++++-----
 3 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 2d9a11b9a..36ca1c6ef 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -2,20 +2,33 @@
 module Tasks
   class DownloadStatsMigrationService
     PAGE_SIZE = 1000
-    def list_object_ids(output_path, after_timestamp = nil)
+    def list_record_info(output_path, after_timestamp = nil)
+
       # Build the query
-      query = FileDownloadStat.select(:id)
+      query = FileDownloadStat.select(:id, :date, :downloads, :file_id)
       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
 
       # Fetch the IDs in batches
-      ids = []
+      records = []
       query.find_in_batches(batch_size: PAGE_SIZE) do |batch|
-        ids.concat(batch.map(&:id))
+        records.concat(batch.map { |record| { id: record.id, date: record.date, downloads: record.downloads, file_id: record.file_id } })
+      end
+
+      # Write the records to the specified CSV file
+      CSV.open(output_path, 'w', write_headers: true, headers: ['id', 'date', 'downloads', 'file_id']) do |csv|
+        records.each do |record|
+          csv << [record[:id], record[:date], record[:downloads], record[:file_id]]
+        end
       end
+    end
+
+    def replicate_to_new_table(id_list_file, clean_index)
+      # Read the list of IDs from the file and strip whitespace
+      ids = File.readlines(id_list_file).map(&:strip)
 
-      # Write the IDs to the specified output file
-      File.open(output_path, 'w') do |file|
-        ids.each { |id| file.puts(id) }
+      # Reindex the objects
+      ids.each do |id|
+        hyc_download_stat = HycDownloadStat.find_or_initialize_by(id)
       end
     end
   end
diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
index 12f3e4a3a..09926a995 100644
--- a/lib/tasks/migrate_download_stats.rake
+++ b/lib/tasks/migrate_download_stats.rake
@@ -17,7 +17,12 @@ namespace :migrate_download_stats do
     args = opts.order!(ARGV) {}
     opts.parse!(args)
 
-    file_path = Tasks::DownloadStatsMigrationService.new.list_object_ids(options[:output_dir], options[:after])
+    unless options[:output_dir].present? && options[:output_dir][-4] == '.csv'
+      puts 'Please provide a valid output directory with a .csv extension'
+      exit 1
+    end
+
+    file_path = Tasks::DownloadStatsMigrationService.new.list_record_info(options[:output_dir], options[:after])
 
     puts "Listing completed in #{Time.now - start_time}s"
     puts "Stored id list to file: #{file_path}"
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index 5d2ca38a4..c709af5c0 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -2,8 +2,8 @@
 require 'rails_helper'
 
 RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do
-  describe '#list_object_ids' do
-    let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.txt') }
+  describe '#list_record_info' do
+    let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') }
     let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) }
 
     before do
@@ -11,16 +11,26 @@
       File.delete(output_path) if File.exist?(output_path)
     end
 
-    it 'writes all IDs to the output file' do
+    it 'writes all records to the output CSV file' do
       service = described_class.new
-      service.list_object_ids(output_path, nil)
+      service.list_record_info(output_path, nil)
 
       expect(File).to exist(output_path)
-      output = File.read(output_path)
-      expected_ids = FileDownloadStat.pluck(:id)
-      output_ids = output.split("\n").map(&:to_i)
 
-      expect(output_ids).to match_array(expected_ids)
+      # Read and parse the CSV file
+      csv_data = CSV.read(output_path, headers: true)
+      output_records = csv_data.map { |row| row.to_h.symbolize_keys }
+
+      expected_records = file_download_stats.map do |stat|
+        {
+          id: stat.id.to_s,
+          date: stat.date.to_s,
+          downloads: stat.downloads.to_s,
+          file_id: stat.file_id
+        }
+      end
+
+      expect(output_records).to match_array(expected_records)
     end
 
     context 'with an after_timestamp' do
@@ -32,14 +42,31 @@
         old_stat = FactoryBot.create(:file_download_stat, updated_at: 2.days.ago)
 
         service = described_class.new
-        service.list_object_ids(output_path, after_timestamp)
+        service.list_record_info(output_path, after_timestamp)
+
+        expect(File).to exist(output_path)
+
+        # Read and parse the CSV file
+        csv_data = CSV.read(output_path, headers: true)
+        output_records = csv_data.map { |row| row.to_h.symbolize_keys }
 
-        output = File.read(output_path)
-        expected_ids = recent_stats.pluck(:id)
-        output_ids = output.split("\n").map(&:to_i)
+        expected_records = recent_stats.map do |stat|
+          {
+            id: stat.id.to_s,
+            date: stat.date.to_s,
+            downloads: stat.downloads.to_s,
+            file_id: stat.file_id
+          }
+        end
 
-        expect(output_ids).not_to include(old_stat.id)
-        expect(output_ids).to match_array(expected_ids)
+        # Ensure old_stat is not included and recent_stats are
+        expect(output_records).not_to include(
+          id: old_stat.id.to_s,
+          date: old_stat.date.to_s,
+          downloads: old_stat.downloads.to_s,
+          file_id: old_stat.file_id
+        )
+        expect(output_records).to match_array(expected_records)
       end
     end
   end

From 6a1b8ed7b288c9ac39d1d3fd3dc6de44d29487cd Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Wed, 21 Aug 2024 14:02:50 -0400
Subject: [PATCH 05/32] truncate by date, update or create stat function

---
 .../tasks/download_stats_migration_service.rb | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 36ca1c6ef..c1a3bbe7b 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -5,10 +5,13 @@ class DownloadStatsMigrationService
     def list_record_info(output_path, after_timestamp = nil)
 
       # Build the query
-      query = FileDownloadStat.select(:id, :date, :downloads, :file_id)
+      query = FileDownloadStat.select("file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads")
       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
 
-      # Fetch the IDs in batches
+      # Group by file_id and truncated month
+      query = query.group("file_id, DATE_TRUNC('month', date)").order('file_id, month')
+
+      # Fetch the IDs in batches for memory efficiency and performance
       records = []
       query.find_in_batches(batch_size: PAGE_SIZE) do |batch|
         records.concat(batch.map { |record| { id: record.id, date: record.date, downloads: record.downloads, file_id: record.file_id } })
@@ -22,14 +25,25 @@ def list_record_info(output_path, after_timestamp = nil)
       end
     end
 
-    def replicate_to_new_table(id_list_file, clean_index)
-      # Read the list of IDs from the file and strip whitespace
-      ids = File.readlines(id_list_file).map(&:strip)
+    def migrate_to_new_table(csv_path)
+      csv_data = CSV.read(csv_path, headers: true)
+      records = csv_data.map { |row| row.to_h.symbolize_keys }
 
-      # Reindex the objects
-      ids.each do |id|
-        hyc_download_stat = HycDownloadStat.find_or_initialize_by(id)
+      # Recreate or update objects in new table
+      records.each do |stat|
+        update_or_create_stat(stat)
       end
     end
+
+    def update_or_create_stat(stat)
+      hyc_download_stat = HycDownloadStat.find_or_initialize_by(id: stat.id.to_s)
+      hyc_download_stat.assign_attributes(
+        date: stat.date,
+        downloads: stat.downloads,
+        file_id: stat.file_id,
+        user_id: stat.user_id
+      )
+      hyc_download_stat.save
+    end
   end
 end

From fd91dfee9625af66512670ae1b17a0043eb6a329 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Wed, 21 Aug 2024 14:59:09 -0400
Subject: [PATCH 06/32] addressing grouping errors

---
 .../tasks/download_stats_migration_service.rb | 23 +++---
 .../download_stats_migration_service_spec.rb  | 78 +++++++++----------
 2 files changed, 48 insertions(+), 53 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index c1a3bbe7b..f5f6e40b9 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -5,22 +5,22 @@ class DownloadStatsMigrationService
     def list_record_info(output_path, after_timestamp = nil)
 
       # Build the query
-      query = FileDownloadStat.select("file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads")
+      query = FileDownloadStat.select("id, file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads")
       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
 
       # Group by file_id and truncated month
-      query = query.group("file_id, DATE_TRUNC('month', date)").order('file_id, month')
+      query = query.group("id, file_id, date, downloads").order('file_id, date')
 
       # Fetch the IDs in batches for memory efficiency and performance
       records = []
-      query.find_in_batches(batch_size: PAGE_SIZE) do |batch|
-        records.concat(batch.map { |record| { id: record.id, date: record.date, downloads: record.downloads, file_id: record.file_id } })
+      query.in_batches(of: PAGE_SIZE, load: true) do |relation|
+        records.concat(relation.map { |record| { file_id: record.file_id, date: record.date, downloads: record.downloads } })
       end
 
       # Write the records to the specified CSV file
-      CSV.open(output_path, 'w', write_headers: true, headers: ['id', 'date', 'downloads', 'file_id']) do |csv|
+      CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv|
         records.each do |record|
-          csv << [record[:id], record[:date], record[:downloads], record[:file_id]]
+          csv << [record[:file_id], record[:date], record[:downloads]]
         end
       end
     end
@@ -36,12 +36,13 @@ def migrate_to_new_table(csv_path)
     end
 
     def update_or_create_stat(stat)
-      hyc_download_stat = HycDownloadStat.find_or_initialize_by(id: stat.id.to_s)
+      puts "Inspect stat: #{stat.inspect}"
+      puts "Stat file_id: #{stat[:file_id]}"
+      hyc_download_stat = HycDownloadStat.find_or_initialize_by(fileset_id: stat[:file_id].to_s)
       hyc_download_stat.assign_attributes(
-        date: stat.date,
-        downloads: stat.downloads,
-        file_id: stat.file_id,
-        user_id: stat.user_id
+        date: stat[:date],
+        download_count: stat[:downloads],
+        fileset_id: stat[:file_id]
       )
       hyc_download_stat.save
     end
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index c709af5c0..a30ed59ff 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -2,26 +2,22 @@
 require 'rails_helper'
 
 RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do
-  describe '#list_record_info' do
-    let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') }
-    let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) }
-
-    before do
-      # Ensure the output file is removed before each test
-      File.delete(output_path) if File.exist?(output_path)
-    end
-
-    it 'writes all records to the output CSV file' do
-      service = described_class.new
-      service.list_record_info(output_path, nil)
+  let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) }
+  let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') }
+  let(:csv_data) { CSV.read(output_path, headers: true) }
+  let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } }
+  let(:service) { described_class.new }
 
-      expect(File).to exist(output_path)
 
-      # Read and parse the CSV file
-      csv_data = CSV.read(output_path, headers: true)
-      output_records = csv_data.map { |row| row.to_h.symbolize_keys }
+  before do
+    # Ensure the output file is removed before each test
+    File.delete(output_path) if File.exist?(output_path)
+  end
 
-      expected_records = file_download_stats.map do |stat|
+  describe '#list_record_info' do
+    # Helper method to convert an array of FileDownloadStat objects to an array of hashes
+    def expected_records_for(stats)
+      stats.map do |stat|
         {
           id: stat.id.to_s,
           date: stat.date.to_s,
@@ -29,44 +25,42 @@
           file_id: stat.file_id
         }
       end
+    end
 
-      expect(output_records).to match_array(expected_records)
+    it 'writes all records to the output CSV file' do
+      service.list_record_info(output_path, nil)
+
+      expect(File).to exist(output_path)
+      expect(output_records).to match_array(expected_records_for(file_download_stats))
     end
 
     context 'with an after_timestamp' do
       let(:after_timestamp) { 1.day.ago }
       let!(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: 1.hour.ago) }
+      let!(:old_stat) { FactoryBot.create(:file_download_stat, updated_at: 2.days.ago) }
 
       it 'filters records by the given timestamp' do
-        # Create a file_download_stat that should not be included
-        old_stat = FactoryBot.create(:file_download_stat, updated_at: 2.days.ago)
-
-        service = described_class.new
         service.list_record_info(output_path, after_timestamp)
 
         expect(File).to exist(output_path)
+        expect(output_records).not_to include(expected_records_for([old_stat]).first)
+        expect(output_records).to match_array(expected_records_for(recent_stats))
+      end
+    end
+  end
 
-        # Read and parse the CSV file
-        csv_data = CSV.read(output_path, headers: true)
-        output_records = csv_data.map { |row| row.to_h.symbolize_keys }
-
-        expected_records = recent_stats.map do |stat|
-          {
-            id: stat.id.to_s,
-            date: stat.date.to_s,
-            downloads: stat.downloads.to_s,
-            file_id: stat.file_id
-          }
-        end
+  describe '#migrate_to_new_table' do
+    before do
+      service.list_record_info(output_path, nil)
+      service.migrate_to_new_table(output_path)
+    end
 
-        # Ensure old_stat is not included and recent_stats are
-        expect(output_records).not_to include(
-          id: old_stat.id.to_s,
-          date: old_stat.date.to_s,
-          downloads: old_stat.downloads.to_s,
-          file_id: old_stat.file_id
-        )
-        expect(output_records).to match_array(expected_records)
+    it 'creates new HycDownloadStat records from the CSV file' do
+      output_records.each do |record|
+        hyc_download_stat = HycDownloadStat.find(record[:id])
+        expect(hyc_download_stat.date.to_s).to eq(record[:date])
+        expect(hyc_download_stat.downloads).to eq(record[:downloads].to_i)
+        expect(hyc_download_stat.file_id).to eq(record[:file_id])
       end
     end
   end

From 143345c666c07db5f4ff433cd81f5d134d91175a Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Wed, 21 Aug 2024 15:25:30 -0400
Subject: [PATCH 07/32] check for truncated date in tests

---
 .../tasks/download_stats_migration_service.rb |  2 +-
 .../download_stats_migration_service_spec.rb  | 20 +++++++++++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index f5f6e40b9..08ed9660d 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -9,7 +9,7 @@ def list_record_info(output_path, after_timestamp = nil)
       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
 
       # Group by file_id and truncated month
-      query = query.group("id, file_id, date, downloads").order('file_id, date')
+      query = query.group('id, file_id, date, downloads').order('file_id, date')
 
       # Fetch the IDs in batches for memory efficiency and performance
       records = []
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index a30ed59ff..f3cc23f55 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -2,7 +2,19 @@
 require 'rails_helper'
 
 RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do
-  let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) }
+  let!(:file_download_stats) do
+    [
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 1)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 1)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 1)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 15)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30))
+    ]
+  end
   let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') }
   let(:csv_data) { CSV.read(output_path, headers: true) }
   let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } }
@@ -16,13 +28,13 @@
 
   describe '#list_record_info' do
     # Helper method to convert an array of FileDownloadStat objects to an array of hashes
+    # Checks for truncated date to the beginning of the month
     def expected_records_for(stats)
       stats.map do |stat|
         {
-          id: stat.id.to_s,
-          date: stat.date.to_s,
+          file_id: stat.file_id,
+          date: stat.date.beginning_of_month.to_s,
           downloads: stat.downloads.to_s,
-          file_id: stat.file_id
         }
       end
     end

From cd78526a3eb0763964879a6f93fc443575787619 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Thu, 22 Aug 2024 17:01:36 -0400
Subject: [PATCH 08/32] refactor functions in download analytics behavior into
 a helper module

---
 .../hyc/download_analytics_behavior.rb        |  45 ++-----
 app/helpers/work_utils_helper.rb              |  19 +++
 .../hyrax/downloads_controller_spec.rb        | 125 ++++++++++--------
 3 files changed, 98 insertions(+), 91 deletions(-)
 create mode 100644 app/helpers/work_utils_helper.rb

diff --git a/app/controllers/concerns/hyc/download_analytics_behavior.rb b/app/controllers/concerns/hyc/download_analytics_behavior.rb
index cf144f3e4..0451751fa 100644
--- a/app/controllers/concerns/hyc/download_analytics_behavior.rb
+++ b/app/controllers/concerns/hyc/download_analytics_behavior.rb
@@ -41,8 +41,8 @@ def track_download
             send_image: '0',
             ua: user_agent,
             # Recovering work id with a solr query
-            dimension1: record_id,
-            dimension2: record_title
+            dimension1: work_data[:work_id],
+            dimension2: work_data[:title]
           }
           uri.query = URI.encode_www_form(uri_params)
           response = HTTParty.get(uri.to_s)
@@ -58,19 +58,16 @@ def track_download
       end
 
       def create_download_stat
-        record_id_value = record_id
-        work_type_value = work_type
-        admin_set_id_value = admin_set_id
         date = Date.today
 
         Rails.logger.debug('Creating or updating hyc-download-stat database entry with the following attributes:')
-        Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{record_id_value}, admin_set_id: #{admin_set_id_value}, work_type: #{work_type_value}, date: #{date.beginning_of_month}")
+        Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{work_data[:work_id]}, admin_set_id: #{work_data[:admin_set_id]}, work_type: #{work_data[:work_type]}, date: #{date.beginning_of_month}")
 
         stat = HycDownloadStat.find_or_initialize_by(
           fileset_id: fileset_id,
-          work_id: record_id_value,
-          admin_set_id: admin_set_id_value,
-          work_type: work_type_value,
+          work_id: work_data[:work_id],
+          admin_set_id: work_data[:admin_set_id],
+          work_type: work_data[:work_type],
           date: date.beginning_of_month
         )
         stat.download_count += 1
@@ -87,38 +84,14 @@ def bot_request?(user_agent)
         browser.bot?
       end
 
-      def fetch_record
-        @record ||= ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs']
-      end
-
-      def fetch_admin_set
-        @admin_set ||= ActiveFedora::SolrService.get("title_tesim:#{@admin_set_name}", rows: 1)['response']['docs']
-      end
-
-      def admin_set_id
-        @admin_set_id ||= fetch_admin_set.dig(0, 'id') || 'Unknown'
-      end
-
-      def record_id
-        @record_id ||= fetch_record.dig(0, 'id') || 'Unknown'
-      end
-
-      def work_type
-        @work_type ||= fetch_record.dig(0, 'has_model_ssim', 0) || 'Unknown'
-      end
-
       def fileset_id
         @fileset_id ||= params[:id] || 'Unknown'
       end
 
-      def record_title
-        @record_title ||= if !fetch_record.blank? && fetch_record[0]['title_tesim']
-                            fetch_record[0]['title_tesim'].first
-                          else
-                            'Unknown'
-                          end
+      def work_data
+        @work_data ||= WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
       end
-
+      
       def site_id
         @site_id ||= ENV['MATOMO_SITE_ID']
       end
diff --git a/app/helpers/work_utils_helper.rb b/app/helpers/work_utils_helper.rb
new file mode 100644
index 000000000..84f23fdf2
--- /dev/null
+++ b/app/helpers/work_utils_helper.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+module WorkUtilsHelper
+    def self.fetch_work_data_by_fileset_id(fileset_id)
+      work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {}
+      raise "No work found for fileset id: #{fileset_id}" if work.blank?
+      # Fetch the admin set related to the work
+      admin_set_name = work['admin_set_tesim']&.first || 'Unknown'
+      admin_set = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs'].first || {}
+
+      {
+        work_id: work['id'] || 'Unknown',
+        work_type: work.dig('has_model_ssim', 0) || 'Unknown',
+        title: work['title_tesim']&.first || 'Unknown',
+        admin_set_id: admin_set['id'] || 'Unknown',
+        admin_set_name: admin_set_name
+      }
+    end
+end
+  
\ No newline at end of file
diff --git a/spec/controllers/hyrax/downloads_controller_spec.rb b/spec/controllers/hyrax/downloads_controller_spec.rb
index e80aa4c37..137a292b7 100644
--- a/spec/controllers/hyrax/downloads_controller_spec.rb
+++ b/spec/controllers/hyrax/downloads_controller_spec.rb
@@ -29,6 +29,13 @@
     'admin_set_tesim' => ['Open_Access_Articles_and_Book_Chapters']}
   ]
   }
+  let(:mock_work_data) { {
+     work_id: '1z40m031g',
+     work_type: 'Article',
+     title: ['Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response'],
+     admin_set_id: 'h128zk07m',
+     admin_set_name: 'Open_Access_Articles_and_Book_Chapters'
+  } }
   let(:file_set) do
     FactoryBot.create(:file_with_work, user: user, content: File.open("#{fixture_path}/files/image.png"))
   end
@@ -53,8 +60,7 @@
     allow(stub_matomo)
     @user = user
     sign_in @user
-    allow(controller).to receive(:fetch_record).and_return(mock_record)
-    allow(controller).to receive(:fetch_admin_set).and_return(mock_admin_set)
+    allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(mock_work_data)
     allow(Hyrax::Analytics.config).to receive(:site_id).and_return(spec_site_id)
     allow(SecureRandom).to receive(:uuid).and_return('555')
     allow(Hyrax::VirusCheckerService).to receive(:file_has_virus?) { false }
@@ -201,8 +207,14 @@
 
     context 'fileset without a parent work' do
       before do
-        allow(controller).to receive(:fetch_record).and_return([{}])
-        allow(controller).to receive(:fetch_admin_set).and_return([{}])
+        dummy_work_data = {
+          work_id: 'Unknown',
+          work_type: 'Unknown',
+          title: 'Unknown',
+          admin_set_id: 'Unknown',
+          admin_set_name: 'Unknown'
+        }
+        allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(dummy_work_data)
       end
 
       it 'records a download event with no work type' do
@@ -332,61 +344,64 @@
     end
   end
 
-  describe '#fetch_record' do
-    it 'fetches the record from Solr' do
-      expect(controller.send(:fetch_record)).to eq(mock_record)
-    end
-  end
-
-  describe '#fetch_admin_set' do
-    it 'fetches the admin set from Solr' do
-      expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set)
-    end
-  end
-
-  describe '#admin_set_id' do
-    it 'returns the admin set id' do
-      expect(controller.send(:admin_set_id)).to eq('h128zk07m')
-    end
-  end
-
-  describe '#record_id' do
-    it 'returns the record id' do
-      expect(controller.send(:record_id)).to eq('1z40m031g')
-    end
-
-    it 'returns Unknown if the record is blank' do
-      allow(controller).to receive(:fetch_record).and_return([])
-      expect(controller.send(:record_id)).to eq('Unknown')
-    end
-  end
-
-  describe '#fileset_id' do
-    it 'returns the fileset id from params' do
-      controller.params = { id: file_set.id }
-      expect(controller.send(:fileset_id)).to eq(file_set.id)
-    end
-
-    it 'returns Unknown if params id is missing' do
-      controller.params = {}
-      expect(controller.send(:fileset_id)).to eq('Unknown')
-    end
-  end
-
-  describe '#record_title' do
-    it 'returns the record title' do
-      expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response')
-    end
-
-    it 'returns Unknown if the record title is blank' do
-      allow(controller).to receive(:fetch_record).and_return([{ 'title_tesim' => nil }])
-      expect(controller.send(:record_title)).to eq('Unknown')
-    end
-  end
+  # describe '#fetch_record' do
+  #   it 'fetches the record from Solr' do
+  #     expect(controller.send(:fetch_record)).to eq(mock_record)
+  #   end
+  # end
+
+  # describe '#fetch_admin_set' do
+  #   it 'fetches the admin set from Solr' do
+  #     expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set)
+  #   end
+  # end
+
+  # describe '#admin_set_id' do
+  #   it 'returns the admin set id' do
+  #     expect(controller.send(:admin_set_id)).to eq('h128zk07m')
+  #   end
+  # end
+
+  # describe '#record_id' do
+  #   it 'returns the record id' do
+  #     expect(controller.send(:record_id)).to eq('1z40m031g')
+  #   end
+
+  #   it 'returns Unknown if the record is blank' do
+  #     # allow(controller).to receive(:fetch_record).and_return([])
+  #     expect(controller.send(:record_id)).to eq('Unknown')
+  #   end
+  # end
+
+  # describe '#fileset_id' do
+  #   it 'returns the fileset id from params' do
+  #     controller.params = { id: file_set.id }
+  #     expect(controller.send(:fileset_id)).to eq(file_set.id)
+  #   end
+
+  #   it 'returns Unknown if params id is missing' do
+  #     controller.params = {}
+  #     expect(controller.send(:fileset_id)).to eq('Unknown')
+  #   end
+  # end
+
+  # describe '#record_title' do
+  #   it 'returns the record title' do
+  #     expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response')
+  #   end
+
+  #   it 'returns Unknown if the record title is blank' do
+  #     # allow(controller).to receive(:fetch_record).and_return([{ 'title_tesim' => nil }])
+  #     expect(controller.send(:record_title)).to eq('Unknown')
+  #   end
+  # end
 
   describe '#site_id' do
     it 'returns the site id from ENV' do
       expect(controller.send(:site_id)).to eq('5')
     end
+    # it 'passes this simple test' do
+    #   expect(true).to eq(true)
+    # end
   end
 end

From 345b910f49e09df30f3f8d948514b9f52e35188a Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Thu, 22 Aug 2024 17:34:47 -0400
Subject: [PATCH 09/32] linting, tests for utility helper and removing comments

---
 .../hyc/download_analytics_behavior.rb        |  2 +-
 app/helpers/work_utils_helper.rb              | 29 ++++---
 .../hyrax/downloads_controller_spec.rb        | 58 +-------------
 spec/helpers/hyrax/work_utils_helper_spec.rb  | 79 +++++++++++++++++++
 4 files changed, 96 insertions(+), 72 deletions(-)
 create mode 100644 spec/helpers/hyrax/work_utils_helper_spec.rb

diff --git a/app/controllers/concerns/hyc/download_analytics_behavior.rb b/app/controllers/concerns/hyc/download_analytics_behavior.rb
index 0451751fa..f2c555b97 100644
--- a/app/controllers/concerns/hyc/download_analytics_behavior.rb
+++ b/app/controllers/concerns/hyc/download_analytics_behavior.rb
@@ -91,7 +91,7 @@ def fileset_id
       def work_data
         @work_data ||= WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
       end
-      
+
       def site_id
         @site_id ||= ENV['MATOMO_SITE_ID']
       end
diff --git a/app/helpers/work_utils_helper.rb b/app/helpers/work_utils_helper.rb
index 84f23fdf2..a92119f63 100644
--- a/app/helpers/work_utils_helper.rb
+++ b/app/helpers/work_utils_helper.rb
@@ -1,19 +1,18 @@
 # frozen_string_literal: true
 module WorkUtilsHelper
-    def self.fetch_work_data_by_fileset_id(fileset_id)
-      work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {}
-      raise "No work found for fileset id: #{fileset_id}" if work.blank?
-      # Fetch the admin set related to the work
-      admin_set_name = work['admin_set_tesim']&.first || 'Unknown'
-      admin_set = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs'].first || {}
+  def self.fetch_work_data_by_fileset_id(fileset_id)
+    work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {}
+    raise "No work found for fileset id: #{fileset_id}" if work.blank?
+    # Fetch the admin set related to the work
+    admin_set_name = work['admin_set_tesim']&.first || 'Unknown'
+    admin_set = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs'].first || {}
 
-      {
-        work_id: work['id'] || 'Unknown',
-        work_type: work.dig('has_model_ssim', 0) || 'Unknown',
-        title: work['title_tesim']&.first || 'Unknown',
-        admin_set_id: admin_set['id'] || 'Unknown',
-        admin_set_name: admin_set_name
-      }
-    end
+    {
+      work_id: work['id'] || 'Unknown',
+      work_type: work.dig('has_model_ssim', 0) || 'Unknown',
+      title: work['title_tesim']&.first || 'Unknown',
+      admin_set_id: admin_set['id'] || 'Unknown',
+      admin_set_name: admin_set_name
+    }
+  end
 end
-  
\ No newline at end of file
diff --git a/spec/controllers/hyrax/downloads_controller_spec.rb b/spec/controllers/hyrax/downloads_controller_spec.rb
index 137a292b7..7720af709 100644
--- a/spec/controllers/hyrax/downloads_controller_spec.rb
+++ b/spec/controllers/hyrax/downloads_controller_spec.rb
@@ -35,7 +35,8 @@
      title: ['Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response'],
      admin_set_id: 'h128zk07m',
      admin_set_name: 'Open_Access_Articles_and_Book_Chapters'
-  } }
+  }
+  }
   let(:file_set) do
     FactoryBot.create(:file_with_work, user: user, content: File.open("#{fixture_path}/files/image.png"))
   end
@@ -344,64 +345,9 @@
     end
   end
 
-  # describe '#fetch_record' do
-  #   it 'fetches the record from Solr' do
-  #     expect(controller.send(:fetch_record)).to eq(mock_record)
-  #   end
-  # end
-
-  # describe '#fetch_admin_set' do
-  #   it 'fetches the admin set from Solr' do
-  #     expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set)
-  #   end
-  # end
-
-  # describe '#admin_set_id' do
-  #   it 'returns the admin set id' do
-  #     expect(controller.send(:admin_set_id)).to eq('h128zk07m')
-  #   end
-  # end
-
-  # describe '#record_id' do
-  #   it 'returns the record id' do
-  #     expect(controller.send(:record_id)).to eq('1z40m031g')
-  #   end
-
-  #   it 'returns Unknown if the record is blank' do
-  #     # allow(controller).to receive(:fetch_record).and_return([])
-  #     expect(controller.send(:record_id)).to eq('Unknown')
-  #   end
-  # end
-
-  # describe '#fileset_id' do
-  #   it 'returns the fileset id from params' do
-  #     controller.params = { id: file_set.id }
-  #     expect(controller.send(:fileset_id)).to eq(file_set.id)
-  #   end
-
-  #   it 'returns Unknown if params id is missing' do
-  #     controller.params = {}
-  #     expect(controller.send(:fileset_id)).to eq('Unknown')
-  #   end
-  # end
-
-  # describe '#record_title' do
-  #   it 'returns the record title' do
-  #     expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response')
-  #   end
-
-  #   it 'returns Unknown if the record title is blank' do
-  #     # allow(controller).to receive(:fetch_record).and_return([{ 'title_tesim' => nil }])
-  #     expect(controller.send(:record_title)).to eq('Unknown')
-  #   end
-  # end
-
   describe '#site_id' do
     it 'returns the site id from ENV' do
       expect(controller.send(:site_id)).to eq('5')
     end
-    # it 'passes this simple test' do
-    #   expect(true).to eq(true)
-    # end
   end
 end
diff --git a/spec/helpers/hyrax/work_utils_helper_spec.rb b/spec/helpers/hyrax/work_utils_helper_spec.rb
new file mode 100644
index 000000000..f2e9c62d8
--- /dev/null
+++ b/spec/helpers/hyrax/work_utils_helper_spec.rb
@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+require 'rails_helper'
+require Rails.root.join('app/overrides/controllers/hydra/controller/download_behavior_override.rb')
+require Rails.root.join('app/overrides/controllers/hyrax/downloads_controller_override.rb')
+
+RSpec.describe WorkUtilsHelper, type: :module do
+  let(:fileset_id) { 'file-set-id' }
+  let(:admin_set_name) { 'Open_Access_Articles_and_Book_Chapters' }
+  let(:example_admin_set_id) { 'h128zk07m' }
+  let(:example_work_id) { '1z40m031g' }
+
+  let(:mock_record) { [{
+    'has_model_ssim' => ['Article'],
+    'id' =>  '1z40m031g',
+    'title_tesim' => ['Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response'],
+    'admin_set_tesim' => ['Open_Access_Articles_and_Book_Chapters']}
+  ]
+  }
+
+  let(:mock_admin_set) { [{
+    'has_model_ssim' => ['AdminSet'],
+    'id' => 'h128zk07m',
+    'title_tesim' => ['Open_Access_Articles_and_Book_Chapters']}
+  ]
+  }
+
+  let(:expected_work_data) { {
+     work_id: '1z40m031g',
+     work_type: 'Article',
+     title: 'Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response',
+     admin_set_id: 'h128zk07m',
+     admin_set_name: 'Open_Access_Articles_and_Book_Chapters'
+  }
+  }
+
+  before do
+    allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => mock_record })
+    allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", rows: 1).and_return('response' => { 'docs' => mock_admin_set })
+  end
+
+  describe '#fetch_work_data_by_fileset_id' do
+    it 'fetches the work data correctly' do
+      result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
+      expect(result).to eq(expected_work_data)
+    end
+
+    it 'properly substitutes Unknown for missing values' do
+        # Mock the solr response to simulate a work with missing values, if it somehow makes it past the initial nil check
+      allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => [{ 'placeholder-key' =>  'placeholder-value' }] })
+      allow(ActiveFedora::SolrService).to receive(:get).with('title_tesim:Unknown', rows: 1).and_return('response' => { 'docs' => [] })
+      result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
+      expect(result[:work_id]).to eq('Unknown')
+      expect(result[:work_type]).to eq('Unknown')
+      expect(result[:title]).to eq('Unknown')
+      expect(result[:admin_set_id]).to eq('Unknown')
+    end
+
+    context 'when no work is found' do
+      before do
+        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => [] })
+      end
+
+      it 'raises an error if no work is found' do
+        expect { WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) }.to raise_error(RuntimeError, "No work found for fileset id: #{fileset_id}")
+      end
+    end
+
+    context 'when admin set is not found' do
+      before do
+        allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", rows: 1).and_return('response' => { 'docs' => [] })
+      end
+
+      it 'sets the admin_set_id to Unknown if admin set is not found' do
+        result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
+        expect(result[:admin_set_id]).to eq('Unknown')
+      end
+    end
+  end
+end

From 7d5572d38c25e7bad0da5688894747c62773d63b Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Fri, 23 Aug 2024 11:57:42 -0400
Subject: [PATCH 10/32] factory for solr query results, adding functionality to
 migration service

---
 .../tasks/download_stats_migration_service.rb | 16 +++++++--
 spec/factories/solr_query_result.rb           | 36 +++++++++++++++++++
 .../download_stats_migration_service_spec.rb  | 17 ++++++++-
 3 files changed, 65 insertions(+), 4 deletions(-)
 create mode 100644 spec/factories/solr_query_result.rb

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 08ed9660d..c3a74a237 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -31,20 +31,30 @@ def migrate_to_new_table(csv_path)
 
       # Recreate or update objects in new table
       records.each do |stat|
-        update_or_create_stat(stat)
+        create_hyc_download_stat(stat)
       end
     end
 
-    def update_or_create_stat(stat)
+    def create_hyc_download_stat(stat)
       puts "Inspect stat: #{stat.inspect}"
       puts "Stat file_id: #{stat[:file_id]}"
       hyc_download_stat = HycDownloadStat.find_or_initialize_by(fileset_id: stat[:file_id].to_s)
+      work_data = work_data_from_stat(stat)
       hyc_download_stat.assign_attributes(
+        fileset_id: stat[:file_id],
+        work_id: work_data[:work_id],
+        admin_set_id: work_data[:admin_set_id],
+        work_type: work_data[:work_type],
         date: stat[:date],
         download_count: stat[:downloads],
-        fileset_id: stat[:file_id]
       )
       hyc_download_stat.save
     end
+
+    # Similar implementation to work_data in DownloadAnalyticsBehavior
+    # Memoization is not necessary here since this method is called per stat
+    def work_data_from_stat(stat)
+      WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id])
+    end
   end
 end
diff --git a/spec/factories/solr_query_result.rb b/spec/factories/solr_query_result.rb
new file mode 100644
index 000000000..e17445ceb
--- /dev/null
+++ b/spec/factories/solr_query_result.rb
@@ -0,0 +1,36 @@
+# spec/factories/solr_query_result.rb
+# frozen_string_literal: true
+
+# Helper method to generate work or admin IDs like '0001abc', '0002abc', etc.
+def generate_id(n)
+  "#{n.to_s.rjust(4, '0')}abc"
+end
+
+# Factory for creating Solr query results
+FactoryBot.define do
+  factory :solr_query_result, class: OpenStruct do
+
+    # General attributes that could be shared between works and admin sets
+    # sequence(:admin_set_id) { |n| "admin_set_id_#{n}" }
+    # sequence(:admin_set_name) { |n| "Admin Set #{n}" }
+
+    trait :work do
+      # Default values for has_model_ssim, admin_set_tesim, and file_set_ids_ssim
+      has_model_ssim { ['Article'] }
+      admin_set_tesim { ['Open_Access_Articles_and_Book_Chapters'] }
+      file_set_ids_ssim { ['file_set_id'] }
+      sequence(:id) { |n| generate_id(n) }
+      sequence(:title_tesim) { |n| ["Test Title #{n}"] }
+    end
+
+    trait :admin_set do
+      # Default values for has_model_ssim and title_tesim
+      has_model_ssim { ['AdminSet'] }
+      title_tesim { ['Open_Access_Articles_and_Book_Chapters'] }
+      sequence(:id) { |n| generate_id(n) }
+    end
+
+    # Override the default save behavior to do nothing since it's a non-ActiveRecord object
+    to_create { |instance| instance }
+  end
+end
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index f3cc23f55..81b1730a2 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -15,6 +15,10 @@
         FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30))
     ]
   end
+  let(:admin_set_title) { 'Open_Access_Articles_and_Book_Chapters' }
+  let!(:mock_admin_set) { FactoryBot.create(:solr_query_result) }
+  # Generate works for the each file_download_stat
+  let!(:mock_works) { file_download_stats.map { |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) } }
   let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') }
   let(:csv_data) { CSV.read(output_path, headers: true) }
   let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } }
@@ -24,6 +28,12 @@
   before do
     # Ensure the output file is removed before each test
     File.delete(output_path) if File.exist?(output_path)
+    allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_title}", rows: 1).and_return('response' => { 'docs' => [mock_admin_set] })
+    file_download_stats.each_with_index do |stat, index|
+      # Assign a random number of downloads to each stat
+      puts "Stat file_id: #{stat.file_id}"
+      allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
+    end
   end
 
   describe '#list_record_info' do
@@ -68,7 +78,12 @@ def expected_records_for(stats)
     end
 
     it 'creates new HycDownloadStat records from the CSV file' do
-      output_records.each do |record|
+      output_records.each_with_index do |record,index|
+        # WIP: Break at index 5
+        if index == 5
+          break
+        end
+        puts "Record: #{record.inspect}"
         hyc_download_stat = HycDownloadStat.find(record[:id])
         expect(hyc_download_stat.date.to_s).to eq(record[:date])
         expect(hyc_download_stat.downloads).to eq(record[:downloads].to_i)

From b891812099eb8591e3771f54e9e94485ac27b83b Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Fri, 23 Aug 2024 13:29:11 -0400
Subject: [PATCH 11/32] helper functions for test class, handling truncation
 outside of query to get around group by errors

---
 .../tasks/download_stats_migration_service.rb |  54 ++++++--
 .../download_stats_migration_service_spec.rb  | 125 +++++++++++-------
 2 files changed, 124 insertions(+), 55 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index c3a74a237..9d32c4d36 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -4,22 +4,37 @@ class DownloadStatsMigrationService
     PAGE_SIZE = 1000
     def list_record_info(output_path, after_timestamp = nil)
 
-      # Build the query
-      query = FileDownloadStat.select("id, file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads")
+    #   # Build the query
+    #   query = FileDownloadStat.select("file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads")
+    #   query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
+
+    #   # Group by file_id and truncated month
+    #   query = query.group('file_id, DATE_TRUNC(\'month\', date)').order('file_id, date')
+      query = FileDownloadStat.all
       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
 
-      # Group by file_id and truncated month
-      query = query.group('id, file_id, date, downloads').order('file_id, date')
+    #   # Fetch the IDs in batches for memory efficiency and performance
+    #   records = []
+    #   query.in_batches(of: PAGE_SIZE, load: true) do |relation|
+    #     records.concat(relation.map { |record| { file_id: record.file_id, date: record.date, downloads: record.downloads } })
+    #   end
 
-      # Fetch the IDs in batches for memory efficiency and performance
+    # Fetch the records in batches to handle large datasets
       records = []
-      query.in_batches(of: PAGE_SIZE, load: true) do |relation|
-        records.concat(relation.map { |record| { file_id: record.file_id, date: record.date, downloads: record.downloads } })
+      query.find_each(batch_size: PAGE_SIZE) do |record|
+        records << {
+          file_id: record.file_id,
+          date: record.date,
+          downloads: record.downloads
+        }
       end
 
+    # Perform aggregation in Ruby
+      aggregated_records = aggregate_downloads(records)
+
       # Write the records to the specified CSV file
       CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv|
-        records.each do |record|
+        aggregated_records.each do |record|
           csv << [record[:file_id], record[:date], record[:downloads]]
         end
       end
@@ -38,7 +53,10 @@ def migrate_to_new_table(csv_path)
     def create_hyc_download_stat(stat)
       puts "Inspect stat: #{stat.inspect}"
       puts "Stat file_id: #{stat[:file_id]}"
-      hyc_download_stat = HycDownloadStat.find_or_initialize_by(fileset_id: stat[:file_id].to_s)
+      hyc_download_stat = HycDownloadStat.find_or_initialize_by(
+        fileset_id: stat[:file_id].to_s,
+        date: stat[:date]
+    )
       work_data = work_data_from_stat(stat)
       hyc_download_stat.assign_attributes(
         fileset_id: stat[:file_id],
@@ -56,5 +74,23 @@ def create_hyc_download_stat(stat)
     def work_data_from_stat(stat)
       WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id])
     end
+
+    def aggregate_downloads(records)
+      aggregated_data = {}
+
+      records.each do |record|
+        file_id = record[:file_id]
+        truncated_date = record[:date].beginning_of_month
+        downloads = record[:downloads]
+
+        # Group the file_id and truncated date to be used as a key
+        key = [file_id, truncated_date]
+        # Initialize the hash for the key if it doesn't exist
+        aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 }
+        # Sum the downloads for each key
+        aggregated_data[key][:downloads] += downloads
+      end
+      aggregated_data.values
+    end
   end
 end
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index 81b1730a2..8029c1bbd 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -2,50 +2,35 @@
 require 'rails_helper'
 
 RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do
-  let!(:file_download_stats) do
-    [
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 1)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 1)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 1)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 15)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30))
-    ]
-  end
   let(:admin_set_title) { 'Open_Access_Articles_and_Book_Chapters' }
-  let!(:mock_admin_set) { FactoryBot.create(:solr_query_result) }
-  # Generate works for the each file_download_stat
-  let!(:mock_works) { file_download_stats.map { |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) } }
+  let(:mock_admin_set) { FactoryBot.create(:solr_query_result, :admin_set, title_tesim: [admin_set_title]) }
   let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') }
-  let(:csv_data) { CSV.read(output_path, headers: true) }
-  let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } }
+  # let(:csv_data) { CSV.read(output_path, headers: true) }
+  # let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } }
   let(:service) { described_class.new }
 
-
   before do
     # Ensure the output file is removed before each test
     File.delete(output_path) if File.exist?(output_path)
     allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_title}", rows: 1).and_return('response' => { 'docs' => [mock_admin_set] })
-    file_download_stats.each_with_index do |stat, index|
-      # Assign a random number of downloads to each stat
-      puts "Stat file_id: #{stat.file_id}"
-      allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
-    end
   end
 
   describe '#list_record_info' do
-    # Helper method to convert an array of FileDownloadStat objects to an array of hashes
-    # Checks for truncated date to the beginning of the month
-    def expected_records_for(stats)
-      stats.map do |stat|
-        {
-          file_id: stat.file_id,
-          date: stat.date.beginning_of_month.to_s,
-          downloads: stat.downloads.to_s,
-        }
+    before do
+      @file_download_stats = [
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 1)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 1)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 1)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 15)),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30))
+      ]
+      @mock_works = @file_download_stats.map { |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) }
+      @file_download_stats.each_with_index do |stat, index|
+        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [@mock_works[index]] })
       end
     end
 
@@ -53,7 +38,7 @@ def expected_records_for(stats)
       service.list_record_info(output_path, nil)
 
       expect(File).to exist(output_path)
-      expect(output_records).to match_array(expected_records_for(file_download_stats))
+      expect(csv_to_hash_array(output_path)).to match_array(expected_records_for(@file_download_stats))
     end
 
     context 'with an after_timestamp' do
@@ -65,30 +50,78 @@ def expected_records_for(stats)
         service.list_record_info(output_path, after_timestamp)
 
         expect(File).to exist(output_path)
-        expect(output_records).not_to include(expected_records_for([old_stat]).first)
-        expect(output_records).to match_array(expected_records_for(recent_stats))
+        expect(csv_to_hash_array(output_path)).not_to include(expected_records_for([old_stat]).first)
+        expect(csv_to_hash_array(output_path)).to match_array(expected_records_for(recent_stats))
       end
     end
   end
 
   describe '#migrate_to_new_table' do
     before do
+        # Smaller groups to enable easier testing for aggregation of download stats from daily to monthly
+      @file_download_stats = [[
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 10, file_id: 'file_id_1'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 10, file_id: 'file_id_1'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'),
+      ],
+      [
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 10, file_id: 'file_id_2'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 10, file_id: 'file_id_2'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 10, file_id: 'file_id_2'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 10, file_id: 'file_id_2'),
+      ],
+      [
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 10, file_id: 'file_id_3'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 10, file_id: 'file_id_3'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 10, file_id: 'file_id_3'),
+        FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 10, file_id: 'file_id_3'),
+      ]]
+      @mock_works = @file_download_stats.flatten.map do |stat|
+        FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id])
+      end
+      @file_download_stats.flatten.each_with_index do |stat, index|
+        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [@mock_works[index]] })
+      end
       service.list_record_info(output_path, nil)
       service.migrate_to_new_table(output_path)
     end
 
     it 'creates new HycDownloadStat records from the CSV file' do
-      output_records.each_with_index do |record,index|
-        # WIP: Break at index 5
-        if index == 5
-          break
-        end
-        puts "Record: #{record.inspect}"
-        hyc_download_stat = HycDownloadStat.find(record[:id])
-        expect(hyc_download_stat.date.to_s).to eq(record[:date])
-        expect(hyc_download_stat.downloads).to eq(record[:downloads].to_i)
-        expect(hyc_download_stat.file_id).to eq(record[:file_id])
+      csv_to_hash_array(output_path).each_with_index do |record, index|
+        # puts 'All HycDownloadStat records:'
+        # HycDownloadStat.all.each do |stat|
+        #   puts "ID: #{stat.id}, File ID: #{stat.fileset_id}, Date: #{stat.date}, Downloads: #{stat.download_count}"
+        # end
+        # puts "Record: #{record.inspect}"
+        # hyc_download_stat = HycDownloadStat.find(record[:id])
+        work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(record[:file_id])
+        hyc_download_stat = HycDownloadStat.find_by(fileset_id: record[:file_id], date: record[:date].to_date.beginning_of_month)
+        
+        expect(hyc_download_stat).to be_present
+        expect(hyc_download_stat.fileset_id).to eq(record[:file_id])
+        expect(hyc_download_stat.work_id).to eq(work_data[:work_id])
+        expect(hyc_download_stat.date).to eq(record[:date].to_date)
+        # Each mocked record has 10 downloads per month, so the download count should be 20
+        expect(hyc_download_stat.download_count).to eq(20)
       end
     end
   end
+
+  private
+  def csv_to_hash_array(file_path)
+    CSV.read(file_path, headers: true).map { |row| row.to_h.symbolize_keys }
+  end
+
+  # Helper method to convert an array of FileDownloadStat objects to an array of hashes
+  # Checks for truncated date to the beginning of the month
+  def expected_records_for(stats)
+    stats.map do |stat|
+      {
+        file_id: stat.file_id,
+        date: stat.date.beginning_of_month.to_s,
+        downloads: stat.downloads.to_s,
+      }
+    end
+  end
 end

From 4f62fb624573e2e9ebe9972c6e5fed74a574b43e Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Fri, 23 Aug 2024 14:42:57 -0400
Subject: [PATCH 12/32] cleaning up test file

---
 .../download_stats_migration_service_spec.rb  | 125 +++++++++---------
 1 file changed, 66 insertions(+), 59 deletions(-)

diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index 8029c1bbd..abe9a4791 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -5,83 +5,96 @@
   let(:admin_set_title) { 'Open_Access_Articles_and_Book_Chapters' }
   let(:mock_admin_set) { FactoryBot.create(:solr_query_result, :admin_set, title_tesim: [admin_set_title]) }
   let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') }
-  # let(:csv_data) { CSV.read(output_path, headers: true) }
-  # let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } }
   let(:service) { described_class.new }
 
   before do
-    # Ensure the output file is removed before each test
-    File.delete(output_path) if File.exist?(output_path)
     allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_title}", rows: 1).and_return('response' => { 'docs' => [mock_admin_set] })
   end
 
-  describe '#list_record_info' do
-    before do
-      @file_download_stats = [
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 1)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 1)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 1)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 15)),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30))
-      ]
-      @mock_works = @file_download_stats.map { |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) }
-      @file_download_stats.each_with_index do |stat, index|
-        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [@mock_works[index]] })
-      end
+  after do
+    # Ensure the output file is removed after each test
+    File.delete(output_path) if File.exist?(output_path)
+  end
+
+  # Smaller groups to allow for easier testing for aggregation of download stats from daily to monthly
+  let(:file_download_stats) { [[
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 10, file_id: 'file_id_1'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 10, file_id: 'file_id_1'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'),
+ ],
+ [
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 10, file_id: 'file_id_2'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 10, file_id: 'file_id_2'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 10, file_id: 'file_id_2'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 10, file_id: 'file_id_2'),
+ ],
+ [
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 10, file_id: 'file_id_3'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 10, file_id: 'file_id_3'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 10, file_id: 'file_id_3'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 10, file_id: 'file_id_3'),
+ ]]
+  }
+
+  let(:mock_works) do
+    file_download_stats.flatten.map do |stat|
+      FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id])
     end
+  end
 
+  describe '#list_record_info' do
     it 'writes all records to the output CSV file' do
+      file_download_stats.flatten.each_with_index do |stat, index|
+        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
+      end
+
+      expected_records = [
+        { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '20' },
+        { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' },
+        { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '20' },
+        { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '20' },
+        { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '20' },
+        { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '20' }
+      ]
       service.list_record_info(output_path, nil)
 
       expect(File).to exist(output_path)
-      expect(csv_to_hash_array(output_path)).to match_array(expected_records_for(@file_download_stats))
+      expect(csv_to_hash_array(output_path)).to match_array(expected_records)
     end
 
     context 'with an after_timestamp' do
-      let(:after_timestamp) { 1.day.ago }
-      let!(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: 1.hour.ago) }
-      let!(:old_stat) { FactoryBot.create(:file_download_stat, updated_at: 2.days.ago) }
+      let(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-05-05 00:00:00 UTC') }
+      let(:old_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-04-05 00:00:00 UTC') }
+      let(:recent_stat_file_ids) { recent_stats.map(&:file_id) }
+      let(:old_stat_file_ids) { old_stats.map(&:file_id) }
+
+      before do
+        all_stats = recent_stats + old_stats
+        all_works = all_stats.map do |stat|
+          FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id])
+        end
+        all_stats.each_with_index do |stat, index|
+          allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [all_works[index]] })
+        end
+      end
 
       it 'filters records by the given timestamp' do
-        service.list_record_info(output_path, after_timestamp)
+        # Retrieve records created after 'updated_at' date for old stats
+        service.list_record_info(output_path, '2023-04-06 00:00:00 UTC')
+        puts "CSV data: #{csv_to_hash_array(output_path).inspect}"
 
         expect(File).to exist(output_path)
-        expect(csv_to_hash_array(output_path)).not_to include(expected_records_for([old_stat]).first)
-        expect(csv_to_hash_array(output_path)).to match_array(expected_records_for(recent_stats))
+        expect(csv_to_hash_array(output_path).map { |record| record[:file_id] }).to match_array(recent_stat_file_ids)
+        expect(csv_to_hash_array(output_path).map { |record| record[:file_id] }).not_to include(*old_stat_file_ids)
       end
     end
   end
 
   describe '#migrate_to_new_table' do
     before do
-        # Smaller groups to enable easier testing for aggregation of download stats from daily to monthly
-      @file_download_stats = [[
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 10, file_id: 'file_id_1'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 10, file_id: 'file_id_1'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'),
-      ],
-      [
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 10, file_id: 'file_id_2'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 10, file_id: 'file_id_2'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 10, file_id: 'file_id_2'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 10, file_id: 'file_id_2'),
-      ],
-      [
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 10, file_id: 'file_id_3'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 10, file_id: 'file_id_3'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 10, file_id: 'file_id_3'),
-        FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 10, file_id: 'file_id_3'),
-      ]]
-      @mock_works = @file_download_stats.flatten.map do |stat|
-        FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id])
-      end
-      @file_download_stats.flatten.each_with_index do |stat, index|
-        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [@mock_works[index]] })
+      file_download_stats.flatten.each_with_index do |stat, index|
+        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
       end
       service.list_record_info(output_path, nil)
       service.migrate_to_new_table(output_path)
@@ -89,15 +102,9 @@
 
     it 'creates new HycDownloadStat records from the CSV file' do
       csv_to_hash_array(output_path).each_with_index do |record, index|
-        # puts 'All HycDownloadStat records:'
-        # HycDownloadStat.all.each do |stat|
-        #   puts "ID: #{stat.id}, File ID: #{stat.fileset_id}, Date: #{stat.date}, Downloads: #{stat.download_count}"
-        # end
-        # puts "Record: #{record.inspect}"
-        # hyc_download_stat = HycDownloadStat.find(record[:id])
         work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(record[:file_id])
         hyc_download_stat = HycDownloadStat.find_by(fileset_id: record[:file_id], date: record[:date].to_date.beginning_of_month)
-        
+
         expect(hyc_download_stat).to be_present
         expect(hyc_download_stat.fileset_id).to eq(record[:file_id])
         expect(hyc_download_stat.work_id).to eq(work_data[:work_id])

From 14325abbfd81691cc46ca624058bf326b4561e44 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Fri, 23 Aug 2024 14:57:02 -0400
Subject: [PATCH 13/32] add task to rake file

---
 .../tasks/download_stats_migration_service.rb | 17 +----------
 lib/tasks/migrate_download_stats.rake         | 29 +++++++++++++++++--
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 9d32c4d36..ff46391b1 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -3,22 +3,9 @@ module Tasks
   class DownloadStatsMigrationService
     PAGE_SIZE = 1000
     def list_record_info(output_path, after_timestamp = nil)
-
-    #   # Build the query
-    #   query = FileDownloadStat.select("file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads")
-    #   query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
-
-    #   # Group by file_id and truncated month
-    #   query = query.group('file_id, DATE_TRUNC(\'month\', date)').order('file_id, date')
       query = FileDownloadStat.all
       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
 
-    #   # Fetch the IDs in batches for memory efficiency and performance
-    #   records = []
-    #   query.in_batches(of: PAGE_SIZE, load: true) do |relation|
-    #     records.concat(relation.map { |record| { file_id: record.file_id, date: record.date, downloads: record.downloads } })
-    #   end
-
     # Fetch the records in batches to handle large datasets
       records = []
       query.find_each(batch_size: PAGE_SIZE) do |record|
@@ -29,7 +16,7 @@ def list_record_info(output_path, after_timestamp = nil)
         }
       end
 
-    # Perform aggregation in Ruby
+     # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
       aggregated_records = aggregate_downloads(records)
 
       # Write the records to the specified CSV file
@@ -51,8 +38,6 @@ def migrate_to_new_table(csv_path)
     end
 
     def create_hyc_download_stat(stat)
-      puts "Inspect stat: #{stat.inspect}"
-      puts "Stat file_id: #{stat[:file_id]}"
       hyc_download_stat = HycDownloadStat.find_or_initialize_by(
         fileset_id: stat[:file_id].to_s,
         date: stat[:date]
diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
index 09926a995..e899d732a 100644
--- a/lib/tasks/migrate_download_stats.rake
+++ b/lib/tasks/migrate_download_stats.rake
@@ -21,11 +21,34 @@ namespace :migrate_download_stats do
       puts 'Please provide a valid output directory with a .csv extension'
       exit 1
     end
+    
+    migration_service = Tasks::DownloadStatsMigrationService.new
+    old_stats_csv = migration_service.list_record_info(options[:output_dir], options[:after])
+    puts "Listing completed in #{Time.now - start_time}s"
+    puts "Stored id list to file: #{options[:output_dir]}"
+    exit 0
+  end
 
-    file_path = Tasks::DownloadStatsMigrationService.new.list_record_info(options[:output_dir], options[:after])
+  desc 'migrate download stats to new table'
+  task :migrate, [:csv_path] => :environment do |_t, _args|
+    start_time = Time.now
+    puts "[#{start_time.utc.iso8601}] Starting migration from CSV to new table"
+    options = {}
 
-    puts "Listing completed in #{Time.now - start_time}s"
-    puts "Stored id list to file: #{file_path}"
+    opts = OptionParser.new
+    opts.banner = 'Usage: bundle exec rake migrate_download_stats:migrate -- [options]'
+    opts.on('-c', '--csv-path ARG', String, 'Path to the CSV file to migrate') { |val| options[:csv_path] = val }
+    args = opts.order!(ARGV) {}
+    opts.parse!(args)
+
+    unless options[:csv_path].present? && File.exist?(options[:csv_path])
+      puts 'Please provide a valid CSV file path'
+      exit 1
+    end
+
+    migration_service = Tasks::DownloadStatsMigrationService.new
+    migration_service.migrate_to_new_table(options[:csv_path])
+    puts "Migration completed in #{Time.now - start_time}s"
     exit 0
   end
 end

From 22f809b991c60e50a4437ef88f94249c494605c2 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Fri, 23 Aug 2024 14:57:23 -0400
Subject: [PATCH 14/32] rubocop

---
 lib/tasks/migrate_download_stats.rake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
index e899d732a..70ef4e51b 100644
--- a/lib/tasks/migrate_download_stats.rake
+++ b/lib/tasks/migrate_download_stats.rake
@@ -21,7 +21,7 @@ namespace :migrate_download_stats do
       puts 'Please provide a valid output directory with a .csv extension'
       exit 1
     end
-    
+
     migration_service = Tasks::DownloadStatsMigrationService.new
     old_stats_csv = migration_service.list_record_info(options[:output_dir], options[:after])
     puts "Listing completed in #{Time.now - start_time}s"

From b738beb7f24a7cacc3e18db11d54eacedea7e5be Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Fri, 23 Aug 2024 15:16:57 -0400
Subject: [PATCH 15/32] error message change

---
 lib/tasks/migrate_download_stats.rake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
index 70ef4e51b..2653fb5d7 100644
--- a/lib/tasks/migrate_download_stats.rake
+++ b/lib/tasks/migrate_download_stats.rake
@@ -18,7 +18,7 @@ namespace :migrate_download_stats do
     opts.parse!(args)
 
     unless options[:output_dir].present? && options[:output_dir][-4] == '.csv'
-      puts 'Please provide a valid output directory with a .csv extension'
+      puts 'Please provide a valid output directory with a .csv extension. Got ' + options[:output_dir].to_s
       exit 1
     end
 

From 4f995e431f96a291f7be9c87d7751448f1d7e7c2 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Fri, 23 Aug 2024 15:31:26 -0400
Subject: [PATCH 16/32] change csv check condition

---
 lib/tasks/migrate_download_stats.rake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
index 2653fb5d7..b20b5242d 100644
--- a/lib/tasks/migrate_download_stats.rake
+++ b/lib/tasks/migrate_download_stats.rake
@@ -17,7 +17,7 @@ namespace :migrate_download_stats do
     args = opts.order!(ARGV) {}
     opts.parse!(args)
 
-    unless options[:output_dir].present? && options[:output_dir][-4] == '.csv'
+    unless options[:output_dir].present? && options[:output_dir].end_with?('.csv')
       puts 'Please provide a valid output directory with a .csv extension. Got ' + options[:output_dir].to_s
       exit 1
     end

From 71cf41747d563c31b99b2aaa7adb157a621d7a62 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Fri, 23 Aug 2024 16:03:27 -0400
Subject: [PATCH 17/32] name change

---
 lib/tasks/migrate_download_stats.rake | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
index b20b5242d..8d2a752f2 100644
--- a/lib/tasks/migrate_download_stats.rake
+++ b/lib/tasks/migrate_download_stats.rake
@@ -4,14 +4,14 @@ require 'optparse'
 require 'optparse/date'
 
 namespace :migrate_download_stats do
-  desc 'list object ids for download stats migration'
-  task :list_ids, [:output_dir, :after] => :environment do |_t, _args|
+  desc 'output object record data for download stats migration into a csv'
+  task :list_record_data, [:output_dir, :after] => :environment do |_t, _args|
     start_time = Time.now
-    puts "[#{start_time.utc.iso8601}] starting listing of ids"
+    puts "[#{start_time.utc.iso8601}] starting listing of record data"
     options = {}
 
     opts = OptionParser.new
-    opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_ids -- [options]'
+    opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_record_data -- [options]'
     opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val }
     opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val }
     args = opts.order!(ARGV) {}

From 862a79300519665ee190baa28ce664185bdfedbe Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 11:07:02 -0400
Subject: [PATCH 18/32] logging and wording change

---
 .../tasks/download_stats_migration_service.rb | 60 +++++++++++++------
 lib/tasks/migrate_download_stats.rake         |  2 +-
 .../download_stats_migration_service_spec.rb  | 38 ++++++------
 3 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index ff46391b1..f460fac7d 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -2,41 +2,63 @@
 module Tasks
   class DownloadStatsMigrationService
     PAGE_SIZE = 1000
-    def list_record_info(output_path, after_timestamp = nil)
+    def list_work_info(output_path, after_timestamp = nil)
       query = FileDownloadStat.all
       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
+      total_works = query.count
+      timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
 
-    # Fetch the records in batches to handle large datasets
-      records = []
-      query.find_each(batch_size: PAGE_SIZE) do |record|
-        records << {
-          file_id: record.file_id,
-          date: record.date,
-          downloads: record.downloads
+     # Log number of works retrieved and timestamp clause
+      Rails.logger.info("Listing works #{timestamp_clause} to #{output_path}")
+      Rails.logger.info("#{total_works} works from the hyrax local cache.")
+
+      works = []
+      works_retrieved_from_query_count = 0
+
+      Rails.logger.info("Retrieving works in batches of #{PAGE_SIZE}")
+     # Fetch the works in batches
+      query.find_each(batch_size: PAGE_SIZE) do |work|
+        works << {
+          file_id: work.file_id,
+          date: work.date,
+          downloads: work.downloads
         }
+        works_retrieved_from_query_count += 1
+        log_progress(works_retrieved_from_query_count, total_works)
       end
 
      # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
-      aggregated_records = aggregate_downloads(records)
+      aggregated_works = aggregate_downloads(works)
 
-      # Write the records to the specified CSV file
+      # Write the works to the specified CSV file
       CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv|
-        aggregated_records.each do |record|
-          csv << [record[:file_id], record[:date], record[:downloads]]
+        aggregated_works.each do |work|
+          csv << [work[:file_id], work[:date], work[:downloads]]
         end
       end
     end
 
     def migrate_to_new_table(csv_path)
       csv_data = CSV.read(csv_path, headers: true)
-      records = csv_data.map { |row| row.to_h.symbolize_keys }
+      works = csv_data.map { |row| row.to_h.symbolize_keys }
 
       # Recreate or update objects in new table
-      records.each do |stat|
+      works.each do |stat|
         create_hyc_download_stat(stat)
       end
     end
 
+    # Log progress at 25%, 50%, 75%, and 100%
+    def log_progress(works_retrieved_from_query_count, total_works, is_aggregation = false)
+        percentages = [0.25, 0.5, 0.75, 1.0]
+        log_intervals = percentages.map { |percent| (total_works * percent).to_i }    
+        if log_intervals.include?(works_retrieved_from_query_count)
+          percentage_done = percentages[log_intervals.index(works_retrieved_from_query_count)] * 100
+          process_type = is_aggregation ? 'Aggregation' : 'Retrieval'
+          Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{works_retrieved_from_query_count}/#{total_works} works)")
+        end
+    end
+
     def create_hyc_download_stat(stat)
       hyc_download_stat = HycDownloadStat.find_or_initialize_by(
         fileset_id: stat[:file_id].to_s,
@@ -60,13 +82,13 @@ def work_data_from_stat(stat)
       WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id])
     end
 
-    def aggregate_downloads(records)
+    def aggregate_downloads(works)
       aggregated_data = {}
 
-      records.each do |record|
-        file_id = record[:file_id]
-        truncated_date = record[:date].beginning_of_month
-        downloads = record[:downloads]
+      works.each do |work|
+        file_id = work[:file_id]
+        truncated_date = work[:date].beginning_of_month
+        downloads = work[:downloads]
 
         # Group the file_id and truncated date to be used as a key
         key = [file_id, truncated_date]
diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
index 8d2a752f2..53088fa27 100644
--- a/lib/tasks/migrate_download_stats.rake
+++ b/lib/tasks/migrate_download_stats.rake
@@ -23,7 +23,7 @@ namespace :migrate_download_stats do
     end
 
     migration_service = Tasks::DownloadStatsMigrationService.new
-    old_stats_csv = migration_service.list_record_info(options[:output_dir], options[:after])
+    old_stats_csv = migration_service.list_work_info(options[:output_dir], options[:after])
     puts "Listing completed in #{Time.now - start_time}s"
     puts "Stored id list to file: #{options[:output_dir]}"
     exit 0
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index abe9a4791..edc45b19f 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -43,13 +43,13 @@
     end
   end
 
-  describe '#list_record_info' do
-    it 'writes all records to the output CSV file' do
+  describe '#list_work_info' do
+    it 'writes all works to the output CSV file' do
       file_download_stats.flatten.each_with_index do |stat, index|
         allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
       end
 
-      expected_records = [
+      expected_works = [
         { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '20' },
         { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' },
         { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '20' },
@@ -57,10 +57,10 @@
         { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '20' },
         { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '20' }
       ]
-      service.list_record_info(output_path, nil)
+      service.list_work_info(output_path, nil)
 
       expect(File).to exist(output_path)
-      expect(csv_to_hash_array(output_path)).to match_array(expected_records)
+      expect(csv_to_hash_array(output_path)).to match_array(expected_works)
     end
 
     context 'with an after_timestamp' do
@@ -79,14 +79,14 @@
         end
       end
 
-      it 'filters records by the given timestamp' do
-        # Retrieve records created after 'updated_at' date for old stats
-        service.list_record_info(output_path, '2023-04-06 00:00:00 UTC')
+      it 'filters works by the given timestamp' do
+        # Retrieve works created after 'updated_at' date for old stats
+        service.list_work_info(output_path, '2023-04-06 00:00:00 UTC')
         puts "CSV data: #{csv_to_hash_array(output_path).inspect}"
 
         expect(File).to exist(output_path)
-        expect(csv_to_hash_array(output_path).map { |record| record[:file_id] }).to match_array(recent_stat_file_ids)
-        expect(csv_to_hash_array(output_path).map { |record| record[:file_id] }).not_to include(*old_stat_file_ids)
+        expect(csv_to_hash_array(output_path).map { |work| work[:file_id] }).to match_array(recent_stat_file_ids)
+        expect(csv_to_hash_array(output_path).map { |work| work[:file_id] }).not_to include(*old_stat_file_ids)
       end
     end
   end
@@ -96,20 +96,20 @@
       file_download_stats.flatten.each_with_index do |stat, index|
         allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
       end
-      service.list_record_info(output_path, nil)
+      service.list_work_info(output_path, nil)
       service.migrate_to_new_table(output_path)
     end
 
-    it 'creates new HycDownloadStat records from the CSV file' do
-      csv_to_hash_array(output_path).each_with_index do |record, index|
-        work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(record[:file_id])
-        hyc_download_stat = HycDownloadStat.find_by(fileset_id: record[:file_id], date: record[:date].to_date.beginning_of_month)
+    it 'creates new HycDownloadStat works from the CSV file' do
+      csv_to_hash_array(output_path).each_with_index do |work, index|
+        work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(work[:file_id])
+        hyc_download_stat = HycDownloadStat.find_by(fileset_id: work[:file_id], date: work[:date].to_date.beginning_of_month)
 
         expect(hyc_download_stat).to be_present
-        expect(hyc_download_stat.fileset_id).to eq(record[:file_id])
+        expect(hyc_download_stat.fileset_id).to eq(work[:file_id])
         expect(hyc_download_stat.work_id).to eq(work_data[:work_id])
-        expect(hyc_download_stat.date).to eq(record[:date].to_date)
-        # Each mocked record has 10 downloads per month, so the download count should be 20
+        expect(hyc_download_stat.date).to eq(work[:date].to_date)
+        # Each mocked work has 10 downloads per month, so the download count should be 20
         expect(hyc_download_stat.download_count).to eq(20)
       end
     end
@@ -122,7 +122,7 @@ def csv_to_hash_array(file_path)
 
   # Helper method to convert an array of FileDownloadStat objects to an array of hashes
   # Checks for truncated date to the beginning of the month
-  def expected_records_for(stats)
+  def expected_works_for(stats)
     stats.map do |stat|
       {
         file_id: stat.file_id,

From d335b7251654362bc5fc094579b9a3708410f115 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 11:07:21 -0400
Subject: [PATCH 19/32] rubocop

---
 .../tasks/download_stats_migration_service.rb      | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index f460fac7d..d01de5b7f 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -50,13 +50,13 @@ def migrate_to_new_table(csv_path)
 
     # Log progress at 25%, 50%, 75%, and 100%
     def log_progress(works_retrieved_from_query_count, total_works, is_aggregation = false)
-        percentages = [0.25, 0.5, 0.75, 1.0]
-        log_intervals = percentages.map { |percent| (total_works * percent).to_i }    
-        if log_intervals.include?(works_retrieved_from_query_count)
-          percentage_done = percentages[log_intervals.index(works_retrieved_from_query_count)] * 100
-          process_type = is_aggregation ? 'Aggregation' : 'Retrieval'
-          Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{works_retrieved_from_query_count}/#{total_works} works)")
-        end
+      percentages = [0.25, 0.5, 0.75, 1.0]
+      log_intervals = percentages.map { |percent| (total_works * percent).to_i }
+      if log_intervals.include?(works_retrieved_from_query_count)
+        percentage_done = percentages[log_intervals.index(works_retrieved_from_query_count)] * 100
+        process_type = is_aggregation ? 'Aggregation' : 'Retrieval'
+        Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{works_retrieved_from_query_count}/#{total_works} works)")
+      end
     end
 
     def create_hyc_download_stat(stat)

From aa9bd558cba303f9e0ffb7dd8252ea4ea105c6ab Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 11:21:50 -0400
Subject: [PATCH 20/32] phrasing change and logging

---
 .../tasks/download_stats_migration_service.rb | 69 ++++++++++---------
 lib/tasks/migrate_download_stats.rake         | 10 +--
 .../download_stats_migration_service_spec.rb  |  8 +--
 3 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index d01de5b7f..4444f6671 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -2,60 +2,63 @@
 module Tasks
   class DownloadStatsMigrationService
     PAGE_SIZE = 1000
-    def list_work_info(output_path, after_timestamp = nil)
+    def list_work_stat_info(output_path, after_timestamp = nil)
       query = FileDownloadStat.all
       query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
-      total_works = query.count
+      total_work_stats = query.count
       timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
 
-     # Log number of works retrieved and timestamp clause
-      Rails.logger.info("Listing works #{timestamp_clause} to #{output_path}")
-      Rails.logger.info("#{total_works} works from the hyrax local cache.")
+     # Log number of work_stats retrieved and timestamp clause
+      Rails.logger.info("Listing work_stats #{timestamp_clause} to #{output_path}")
+      Rails.logger.info("#{total_work_stats} work_stats from the hyrax local cache.")
 
-      works = []
-      works_retrieved_from_query_count = 0
+      work_stats = []
+      work_stats_retrieved_from_query_count = 0
 
-      Rails.logger.info("Retrieving works in batches of #{PAGE_SIZE}")
-     # Fetch the works in batches
-      query.find_each(batch_size: PAGE_SIZE) do |work|
-        works << {
-          file_id: work.file_id,
-          date: work.date,
-          downloads: work.downloads
+      Rails.logger.info("Retrieving work_stats in batches of #{PAGE_SIZE}")
+     # Fetch the work_stats in batches
+      query.find_each(batch_size: PAGE_SIZE) do |stat|
+        work_stats << {
+          file_id: stat.file_id,
+          date: stat.date,
+          downloads: stat.downloads
         }
-        works_retrieved_from_query_count += 1
-        log_progress(works_retrieved_from_query_count, total_works)
+        work_stats_retrieved_from_query_count += 1
+        log_progress(work_stats_retrieved_from_query_count, total_work_stats)
       end
 
      # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
-      aggregated_works = aggregate_downloads(works)
+      Rails.logger.info("Aggregating daily stats into monthly stats")
+      aggregated_work_stats = aggregate_downloads(work_stats)
 
-      # Write the works to the specified CSV file
+      Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats")
+
+      # Write the work_stats to the specified CSV file
       CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv|
-        aggregated_works.each do |work|
-          csv << [work[:file_id], work[:date], work[:downloads]]
+        aggregated_work_stats.each do |stat|
+          csv << [stat[:file_id], stat[:date], stat[:downloads]]
         end
       end
     end
 
     def migrate_to_new_table(csv_path)
       csv_data = CSV.read(csv_path, headers: true)
-      works = csv_data.map { |row| row.to_h.symbolize_keys }
+      work_stats = csv_data.map { |row| row.to_h.symbolize_keys }
 
       # Recreate or update objects in new table
-      works.each do |stat|
+      work_stats.each do |stat|
         create_hyc_download_stat(stat)
       end
     end
 
     # Log progress at 25%, 50%, 75%, and 100%
-    def log_progress(works_retrieved_from_query_count, total_works, is_aggregation = false)
+    def log_progress(work_stats_count, total_work_stats, is_aggregation = false)
       percentages = [0.25, 0.5, 0.75, 1.0]
-      log_intervals = percentages.map { |percent| (total_works * percent).to_i }
-      if log_intervals.include?(works_retrieved_from_query_count)
-        percentage_done = percentages[log_intervals.index(works_retrieved_from_query_count)] * 100
+      log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i }
+      if log_intervals.include?(work_stats_count)
+        percentage_done = percentages[log_intervals.index(work_stats_count)] * 100
         process_type = is_aggregation ? 'Aggregation' : 'Retrieval'
-        Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{works_retrieved_from_query_count}/#{total_works} works)")
+        Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{work_stats_count}/#{total_work_stats} work_stats)")
       end
     end
 
@@ -82,13 +85,14 @@ def work_data_from_stat(stat)
       WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id])
     end
 
-    def aggregate_downloads(works)
+    def aggregate_downloads(work_stats)
       aggregated_data = {}
+      aggregated_work_stats_count = 0
 
-      works.each do |work|
-        file_id = work[:file_id]
-        truncated_date = work[:date].beginning_of_month
-        downloads = work[:downloads]
+      work_stats.each do |stat|
+        file_id = stat[:file_id]
+        truncated_date = stat[:date].beginning_of_month
+        downloads = stat[:downloads]
 
         # Group the file_id and truncated date to be used as a key
         key = [file_id, truncated_date]
@@ -96,6 +100,7 @@ def aggregate_downloads(works)
         aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 }
         # Sum the downloads for each key
         aggregated_data[key][:downloads] += downloads
+        log_progress(aggregated_work_stats_count, work_stats.count, is_aggregation: true)
       end
       aggregated_data.values
     end
diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake
index 53088fa27..15edc556e 100644
--- a/lib/tasks/migrate_download_stats.rake
+++ b/lib/tasks/migrate_download_stats.rake
@@ -4,14 +4,14 @@ require 'optparse'
 require 'optparse/date'
 
 namespace :migrate_download_stats do
-  desc 'output object record data for download stats migration into a csv'
-  task :list_record_data, [:output_dir, :after] => :environment do |_t, _args|
+  desc 'output rows for download stat migration into a csv'
+  task :list_rows, [:output_dir, :after] => :environment do |_t, _args|
     start_time = Time.now
-    puts "[#{start_time.utc.iso8601}] starting listing of record data"
+    puts "[#{start_time.utc.iso8601}] starting listing of work data"
     options = {}
 
     opts = OptionParser.new
-    opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_record_data -- [options]'
+    opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]'
     opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val }
     opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val }
     args = opts.order!(ARGV) {}
@@ -23,7 +23,7 @@ namespace :migrate_download_stats do
     end
 
     migration_service = Tasks::DownloadStatsMigrationService.new
-    old_stats_csv = migration_service.list_work_info(options[:output_dir], options[:after])
+    old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after])
     puts "Listing completed in #{Time.now - start_time}s"
     puts "Stored id list to file: #{options[:output_dir]}"
     exit 0
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index edc45b19f..60a9ccc02 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -43,7 +43,7 @@
     end
   end
 
-  describe '#list_work_info' do
+  describe '#list_work_stat_info' do
     it 'writes all works to the output CSV file' do
       file_download_stats.flatten.each_with_index do |stat, index|
         allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
@@ -57,7 +57,7 @@
         { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '20' },
         { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '20' }
       ]
-      service.list_work_info(output_path, nil)
+      service.list_work_stat_info(output_path, nil)
 
       expect(File).to exist(output_path)
       expect(csv_to_hash_array(output_path)).to match_array(expected_works)
@@ -81,7 +81,7 @@
 
       it 'filters works by the given timestamp' do
         # Retrieve works created after 'updated_at' date for old stats
-        service.list_work_info(output_path, '2023-04-06 00:00:00 UTC')
+        service.list_work_stat_info(output_path, '2023-04-06 00:00:00 UTC')
         puts "CSV data: #{csv_to_hash_array(output_path).inspect}"
 
         expect(File).to exist(output_path)
@@ -96,7 +96,7 @@
       file_download_stats.flatten.each_with_index do |stat, index|
         allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
       end
-      service.list_work_info(output_path, nil)
+      service.list_work_stat_info(output_path, nil)
       service.migrate_to_new_table(output_path)
     end
 

From a914e06df4324813fc1bb24f3d4ea5b8a3ad13e5 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 11:31:55 -0400
Subject: [PATCH 21/32] slight logging change and moving csv writing into
 another function

---
 .../tasks/download_stats_migration_service.rb | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 4444f6671..92fec6b47 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -8,9 +8,8 @@ def list_work_stat_info(output_path, after_timestamp = nil)
       total_work_stats = query.count
       timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
 
-     # Log number of work_stats retrieved and timestamp clause
-      Rails.logger.info("Listing work_stats #{timestamp_clause} to #{output_path}")
-      Rails.logger.info("#{total_work_stats} work_stats from the hyrax local cache.")
+     # Log number of work stats retrieved and timestamp clause
+      Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")
 
       work_stats = []
       work_stats_retrieved_from_query_count = 0
@@ -30,15 +29,10 @@ def list_work_stat_info(output_path, after_timestamp = nil)
      # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
       Rails.logger.info("Aggregating daily stats into monthly stats")
       aggregated_work_stats = aggregate_downloads(work_stats)
-
       Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats")
 
       # Write the work_stats to the specified CSV file
-      CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv|
-        aggregated_work_stats.each do |stat|
-          csv << [stat[:file_id], stat[:date], stat[:downloads]]
-        end
-      end
+      write_to_csv(output_path, aggregated_work_stats)
     end
 
     def migrate_to_new_table(csv_path)
@@ -104,5 +98,16 @@ def aggregate_downloads(work_stats)
       end
       aggregated_data.values
     end
+
+    # Method to write work stats to a CSV file
+    def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloads'])
+      CSV.open(output_path, 'w', write_headers: true, headers: headers) do |csv|
+        work_stats.each do |stat|
+          csv << [stat[:file_id], stat[:date], stat[:downloads]]
+        end
+      end
+      Rails.logger.info("Work stats successfully written to #{output_path}")
+    end
+
   end
 end

From 59145a5a9721648c112855addb4d5d1ef164fdb4 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 11:34:00 -0400
Subject: [PATCH 22/32] rubocop

---
 app/services/tasks/download_stats_migration_service.rb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 92fec6b47..c4c37f8c6 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -27,7 +27,7 @@ def list_work_stat_info(output_path, after_timestamp = nil)
       end
 
      # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
-      Rails.logger.info("Aggregating daily stats into monthly stats")
+      Rails.logger.info('Aggregating daily stats into monthly stats')
       aggregated_work_stats = aggregate_downloads(work_stats)
       Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats")
 
@@ -45,6 +45,8 @@ def migrate_to_new_table(csv_path)
       end
     end
 
+    private
+
     # Log progress at 25%, 50%, 75%, and 100%
     def log_progress(work_stats_count, total_work_stats, is_aggregation = false)
       percentages = [0.25, 0.5, 0.75, 1.0]

From df9279db84d5a09f53303fa6c6e48be7a9f56023 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 12:41:02 -0400
Subject: [PATCH 23/32] changed arguments for log progress function

---
 .../tasks/download_stats_migration_service.rb        | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index c4c37f8c6..e8cb37943 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -23,7 +23,7 @@ def list_work_stat_info(output_path, after_timestamp = nil)
           downloads: stat.downloads
         }
         work_stats_retrieved_from_query_count += 1
-        log_progress(work_stats_retrieved_from_query_count, total_work_stats)
+        log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval')
       end
 
      # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
@@ -38,22 +38,25 @@ def list_work_stat_info(output_path, after_timestamp = nil)
     def migrate_to_new_table(csv_path)
       csv_data = CSV.read(csv_path, headers: true)
       work_stats = csv_data.map { |row| row.to_h.symbolize_keys }
+      migrated_work_stats_count = 0
 
+      Rails.logger.info("Migrating #{work_stats.count} work stats to the new table.")
       # Recreate or update objects in new table
       work_stats.each do |stat|
         create_hyc_download_stat(stat)
+        migrated_work_stats_count += 1
+        log_progress(migrated_work_stats_count, work_stats.count, 'Migration')
       end
     end
 
     private
 
     # Log progress at 25%, 50%, 75%, and 100%
-    def log_progress(work_stats_count, total_work_stats, is_aggregation = false)
+    def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval')
       percentages = [0.25, 0.5, 0.75, 1.0]
       log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i }
       if log_intervals.include?(work_stats_count)
         percentage_done = percentages[log_intervals.index(work_stats_count)] * 100
-        process_type = is_aggregation ? 'Aggregation' : 'Retrieval'
         Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{work_stats_count}/#{total_work_stats} work_stats)")
       end
     end
@@ -96,7 +99,8 @@ def aggregate_downloads(work_stats)
         aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 }
         # Sum the downloads for each key
         aggregated_data[key][:downloads] += downloads
-        log_progress(aggregated_work_stats_count, work_stats.count, is_aggregation: true)
+        aggregated_work_stats_count += 1
+        log_progress(aggregated_work_stats_count, work_stats.count, 'Aggregation')
       end
       aggregated_data.values
     end

From 27d54b7ad078d9ac651b28681a603370863aa0de Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 13:43:18 -0400
Subject: [PATCH 24/32] progress tracking in migration service

---
 .../tasks/download_stats_migration_service.rb | 94 ++++++++++++++-----
 1 file changed, 72 insertions(+), 22 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index e8cb37943..456923025 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -37,16 +37,24 @@ def list_work_stat_info(output_path, after_timestamp = nil)
 
     def migrate_to_new_table(csv_path)
       csv_data = CSV.read(csv_path, headers: true)
-      work_stats = csv_data.map { |row| row.to_h.symbolize_keys }
-      migrated_work_stats_count = 0
-
-      Rails.logger.info("Migrating #{work_stats.count} work stats to the new table.")
+      csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys }
+      progress_tracker = {
+        all_categories: 0,
+        created: 0,
+        updated: 0,
+        skipped: 0,
+        failed: 0
+      }
+
+      Rails.logger.info("Migrating #{csv_data_stats.count} work stats to the new table.")
       # Recreate or update objects in new table
-      work_stats.each do |stat|
-        create_hyc_download_stat(stat)
-        migrated_work_stats_count += 1
-        log_progress(migrated_work_stats_count, work_stats.count, 'Migration')
+      csv_data_stats.each do |stat|
+        create_hyc_download_stat(stat, progress_tracker)
+        # WIP: Thinking about moving updates into an update progress tracker method
+        # progress_tracker[:all_categories] += 1
+        log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration')
       end
+      
     end
 
     private
@@ -61,21 +69,45 @@ def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval')
       end
     end
 
-    def create_hyc_download_stat(stat)
-      hyc_download_stat = HycDownloadStat.find_or_initialize_by(
-        fileset_id: stat[:file_id].to_s,
-        date: stat[:date]
-    )
-      work_data = work_data_from_stat(stat)
-      hyc_download_stat.assign_attributes(
-        fileset_id: stat[:file_id],
-        work_id: work_data[:work_id],
-        admin_set_id: work_data[:admin_set_id],
-        work_type: work_data[:work_type],
-        date: stat[:date],
-        download_count: stat[:downloads],
+    def create_hyc_download_stat(stat, progress_tracker)
+      begin
+        hyc_download_stat = HycDownloadStat.find_or_initialize_by(
+          fileset_id: stat[:file_id].to_s,
+          date: stat[:date]
       )
-      hyc_download_stat.save
+        work_data = work_data_from_stat(stat)
+        hyc_download_stat.assign_attributes(
+          fileset_id: stat[:file_id],
+          work_id: work_data[:work_id],
+          admin_set_id: work_data[:admin_set_id],
+          work_type: work_data[:work_type],
+          date: stat[:date],
+          download_count: stat[:downloads],
+        )
+      rescue StandardError => e
+        Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{e.message}")
+        progress_tracker[:failed] += 1
+      end
+      save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
+
+        # if hyc_download_stat.new_record?
+        #   if hyc_download_stat.save
+        #     return :created
+        #   else
+        #     Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}")
+        #     return :failed
+        #   end
+        # elsif hyc_download_stat.changed?
+        #   if hyc_download_stat.save
+        #     return :updated
+        #   else
+        #     Rails.logger.error("Failed to update HycDownloadStat for #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}")
+        #     return :failed
+        #   end
+        # else
+        #   return :skipped
+        # end
+      
     end
 
     # Similar implementation to work_data in DownloadAnalyticsBehavior
@@ -115,5 +147,23 @@ def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloa
       Rails.logger.info("Work stats successfully written to #{output_path}")
     end
 
+    # Method to save the HycDownloadStat object and update the progress tracker
+    def save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
+      begin
+        if hyc_download_stat.new_record?
+          hyc_download_stat.save
+          progress_tracker[:created] += 1
+        elsif hyc_download_stat.changed?
+          hyc_download_stat.save
+          progress_tracker[:updated] += 1
+        else
+          progress_tracker[:skipped] += 1
+        end
+      rescue StandardError => e
+        Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{e.message}")
+        progress_tracker[:failed] += 1
+      end
+    end    
+
   end
 end

From 50058c4f9ac3792fd8853a7bbca36f930fc58f8e Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 14:56:53 -0400
Subject: [PATCH 25/32] error handling

---
 .../tasks/download_stats_migration_service.rb | 121 ++++++++----------
 1 file changed, 55 insertions(+), 66 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 456923025..6c92ee488 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -3,58 +3,66 @@ module Tasks
   class DownloadStatsMigrationService
     PAGE_SIZE = 1000
     def list_work_stat_info(output_path, after_timestamp = nil)
-      query = FileDownloadStat.all
-      query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
-      total_work_stats = query.count
-      timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
-
-     # Log number of work stats retrieved and timestamp clause
-      Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")
-
-      work_stats = []
-      work_stats_retrieved_from_query_count = 0
-
-      Rails.logger.info("Retrieving work_stats in batches of #{PAGE_SIZE}")
-     # Fetch the work_stats in batches
-      query.find_each(batch_size: PAGE_SIZE) do |stat|
-        work_stats << {
-          file_id: stat.file_id,
-          date: stat.date,
-          downloads: stat.downloads
-        }
-        work_stats_retrieved_from_query_count += 1
-        log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval')
-      end
+      begin
+        query = FileDownloadStat.all
+        query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
+        total_work_stats = query.count
+        timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'
+
+      # Log number of work stats retrieved and timestamp clause
+        Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")
+
+        work_stats = []
+        work_stats_retrieved_from_query_count = 0
+
+        Rails.logger.info("Retrieving work_stats in batches of #{PAGE_SIZE}")
+      # Fetch the work_stats in batches
+        query.find_each(batch_size: PAGE_SIZE) do |stat|
+          work_stats << {
+            file_id: stat.file_id,
+            date: stat.date,
+            downloads: stat.downloads
+          }
+          work_stats_retrieved_from_query_count += 1
+          log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval')
+        end
 
-     # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
-      Rails.logger.info('Aggregating daily stats into monthly stats')
-      aggregated_work_stats = aggregate_downloads(work_stats)
-      Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats")
+      # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
+        Rails.logger.info('Aggregating daily stats into monthly stats')
+        aggregated_work_stats = aggregate_downloads(work_stats)
+        Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats")
 
-      # Write the work_stats to the specified CSV file
-      write_to_csv(output_path, aggregated_work_stats)
+        # Write the work_stats to the specified CSV file
+        write_to_csv(output_path, aggregated_work_stats)
+      rescue StandardError => e
+        Rails.logger.error("An error occurred while listing work stats: #{e.message}")
+        Rails.logger.error(e.backtrace.join("\n"))
+      end
     end
 
     def migrate_to_new_table(csv_path)
-      csv_data = CSV.read(csv_path, headers: true)
-      csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys }
-      progress_tracker = {
-        all_categories: 0,
-        created: 0,
-        updated: 0,
-        skipped: 0,
-        failed: 0
-      }
-
-      Rails.logger.info("Migrating #{csv_data_stats.count} work stats to the new table.")
-      # Recreate or update objects in new table
-      csv_data_stats.each do |stat|
-        create_hyc_download_stat(stat, progress_tracker)
-        # WIP: Thinking about moving updates into an update progress tracker method
-        # progress_tracker[:all_categories] += 1
-        log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration')
+      begin
+        csv_data = CSV.read(csv_path, headers: true)
+        csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys }
+        progress_tracker = {
+          all_categories: 0,
+          created: 0,
+          updated: 0,
+          skipped: 0,
+          failed: 0
+        }
+
+        Rails.logger.info("Migrating #{csv_data_stats.count} work stats to the new table.")
+        # Recreate or update objects in new table
+        csv_data_stats.each do |stat|
+          create_hyc_download_stat(stat, progress_tracker)
+          log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration')
+        end
+        Rails.logger.info("Migration complete: #{progress_tracker[:created]} created, #{progress_tracker[:updated]} updated, #{progress_tracker[:skipped]} skipped, #{progress_tracker[:failed]} failed")
+      rescue StandardError => e
+        Rails.logger.error("An error occurred while migrating work stats: #{e.message}")
+        Rails.logger.error(e.backtrace.join("\n"))
       end
-      
     end
 
     private
@@ -89,25 +97,6 @@ def create_hyc_download_stat(stat, progress_tracker)
         progress_tracker[:failed] += 1
       end
       save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
-
-        # if hyc_download_stat.new_record?
-        #   if hyc_download_stat.save
-        #     return :created
-        #   else
-        #     Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}")
-        #     return :failed
-        #   end
-        # elsif hyc_download_stat.changed?
-        #   if hyc_download_stat.save
-        #     return :updated
-        #   else
-        #     Rails.logger.error("Failed to update HycDownloadStat for #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}")
-        #     return :failed
-        #   end
-        # else
-        #   return :skipped
-        # end
-      
     end
 
     # Similar implementation to work_data in DownloadAnalyticsBehavior
@@ -163,7 +152,7 @@ def save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
         Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{e.message}")
         progress_tracker[:failed] += 1
       end
-    end    
+    end
 
   end
 end

From e9ea6dc92eaff109acc3562b2467ead17ef9b267 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Mon, 26 Aug 2024 18:25:34 -0400
Subject: [PATCH 26/32] increment all_categories in progress tracker

---
 app/services/tasks/download_stats_migration_service.rb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 6c92ee488..92a035950 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -56,6 +56,7 @@ def migrate_to_new_table(csv_path)
         # Recreate or update objects in new table
         csv_data_stats.each do |stat|
           create_hyc_download_stat(stat, progress_tracker)
+          progress_tracker[:all_categories] += 1
           log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration')
         end
         Rails.logger.info("Migration complete: #{progress_tracker[:created]} created, #{progress_tracker[:updated]} updated, #{progress_tracker[:skipped]} skipped, #{progress_tracker[:failed]} failed")

From 115df4222c1643a535c0f9b4b05141f2c57aa0bc Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Tue, 27 Aug 2024 17:09:51 -0400
Subject: [PATCH 27/32] tests

---
 .../tasks/download_stats_migration_service_spec.rb | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index 60a9ccc02..467a61199 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -63,6 +63,13 @@
       expect(csv_to_hash_array(output_path)).to match_array(expected_works)
     end
 
+    it 'handles and logs errors' do
+      allow(Rails.logger).to receive(:error)
+      allow(FileDownloadStat).to receive(:all).and_raise(StandardError, 'Simulated database query failure')
+      service.list_work_stat_info(output_path, nil)
+      expect(Rails.logger).to have_received(:error).with('An error occurred while listing work stats: Simulated database query failure')
+    end
+
     context 'with an after_timestamp' do
       let(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-05-05 00:00:00 UTC') }
       let(:old_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-04-05 00:00:00 UTC') }
@@ -113,6 +120,13 @@
         expect(hyc_download_stat.download_count).to eq(20)
       end
     end
+
+    it 'handles and logs errors' do
+      allow(CSV).to receive(:read).and_raise(StandardError, 'Simulated CSV read failure')
+      allow(Rails.logger).to receive(:error)
+      service.migrate_to_new_table(output_path)
+      expect(Rails.logger).to have_received(:error).with('An error occurred while migrating work stats: Simulated CSV read failure')
+    end
   end
 
   private

From 177dc77f615db0e60cbad069e90c4574c9b04384 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Tue, 27 Aug 2024 18:09:51 -0400
Subject: [PATCH 28/32] remove comments

---
 spec/factories/solr_query_result.rb | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/spec/factories/solr_query_result.rb b/spec/factories/solr_query_result.rb
index e17445ceb..b718f4e95 100644
--- a/spec/factories/solr_query_result.rb
+++ b/spec/factories/solr_query_result.rb
@@ -9,11 +9,6 @@ def generate_id(n)
 # Factory for creating Solr query results
 FactoryBot.define do
   factory :solr_query_result, class: OpenStruct do
-
-    # General attributes that could be shared between works and admin sets
-    # sequence(:admin_set_id) { |n| "admin_set_id_#{n}" }
-    # sequence(:admin_set_name) { |n| "Admin Set #{n}" }
-
     trait :work do
       # Default values for has_model_ssim, admin_set_tesim, and file_set_ids_ssim
       has_model_ssim { ['Article'] }

From c2d339c303ad05855050f3e4d78412481874fa32 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Wed, 28 Aug 2024 13:29:16 -0400
Subject: [PATCH 29/32] update test to avoid checking for the same value

---
 .../tasks/download_stats_migration_service.rb |  2 +-
 .../download_stats_migration_service_spec.rb  | 51 +++++++++++--------
 2 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index 92a035950..b5edcb7de 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -27,7 +27,7 @@ def list_work_stat_info(output_path, after_timestamp = nil)
           log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval')
         end
 
-      # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries
+      # Perform aggregation of daily stats into monthly stats in Ruby, encountered issues with SQL queries
         Rails.logger.info('Aggregating daily stats into monthly stats')
         aggregated_work_stats = aggregate_downloads(work_stats)
         Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats")
diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index 467a61199..90e5f6cf6 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -18,25 +18,32 @@
 
   # Smaller groups to allow for easier testing for aggregation of download stats from daily to monthly
   let(:file_download_stats) { [[
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 10, file_id: 'file_id_1'),
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 10, file_id: 'file_id_1'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 5, file_id: 'file_id_1'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 5, file_id: 'file_id_1'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'),
  ],
  [
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 10, file_id: 'file_id_2'),
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 10, file_id: 'file_id_2'),
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 10, file_id: 'file_id_2'),
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 10, file_id: 'file_id_2'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 25, file_id: 'file_id_2'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 25, file_id: 'file_id_2'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 50, file_id: 'file_id_2'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 50, file_id: 'file_id_2'),
  ],
  [
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 10, file_id: 'file_id_3'),
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 10, file_id: 'file_id_3'),
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 10, file_id: 'file_id_3'),
-   FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 10, file_id: 'file_id_3'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 100, file_id: 'file_id_3'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 100, file_id: 'file_id_3'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 150, file_id: 'file_id_3'),
+   FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 150, file_id: 'file_id_3'),
  ]]
   }
 
+  # Create a hash of [fileset_id, date.beginning_of_month] => download count for each file_download_stats
+  let(:expected_aggregated_download_count) do
+    file_download_stats.flatten.each_with_object(Hash.new(0)) do |stat, hash|
+      hash[[stat.file_id, stat.date.beginning_of_month.to_datetime]] += stat.downloads
+    end
+  end
+
   let(:mock_works) do
     file_download_stats.flatten.map do |stat|
       FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id])
@@ -50,12 +57,12 @@
       end
 
       expected_works = [
-        { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '20' },
+        { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '10' },
         { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' },
-        { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '20' },
-        { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '20' },
-        { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '20' },
-        { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '20' }
+        { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '50' },
+        { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '100' },
+        { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '200' },
+        { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '300' }
       ]
       service.list_work_stat_info(output_path, nil)
 
@@ -108,16 +115,16 @@
     end
 
     it 'creates new HycDownloadStat works from the CSV file' do
-      csv_to_hash_array(output_path).each_with_index do |work, index|
-        work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(work[:file_id])
-        hyc_download_stat = HycDownloadStat.find_by(fileset_id: work[:file_id], date: work[:date].to_date.beginning_of_month)
+      csv_to_hash_array(output_path).each_with_index do |csv_row, index|
+        work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(csv_row[:file_id])
+        csv_row_date = Date.parse(csv_row[:date]).beginning_of_month
+        hyc_download_stat = HycDownloadStat.find_by(fileset_id: csv_row[:file_id], date: csv_row_date)
 
         expect(hyc_download_stat).to be_present
-        expect(hyc_download_stat.fileset_id).to eq(work[:file_id])
+        expect(hyc_download_stat.fileset_id).to eq(csv_row[:file_id])
         expect(hyc_download_stat.work_id).to eq(work_data[:work_id])
-        expect(hyc_download_stat.date).to eq(work[:date].to_date)
-        # Each mocked work has 10 downloads per month, so the download count should be 20
-        expect(hyc_download_stat.download_count).to eq(20)
+        expect(hyc_download_stat.date).to eq(csv_row[:date].to_date)
+        expect(hyc_download_stat.download_count).to eq(expected_aggregated_download_count[[csv_row[:file_id], csv_row_date]])
       end
     end
 

From c7b2b6009b87f7019113d58e1c3ab9653efa5536 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Wed, 28 Aug 2024 14:36:31 -0400
Subject: [PATCH 30/32] aggregate stats during retrieval

---
 .../tasks/download_stats_migration_service.rb | 50 ++++++-------------
 1 file changed, 15 insertions(+), 35 deletions(-)

diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb
index b5edcb7de..c8865193f 100644
--- a/app/services/tasks/download_stats_migration_service.rb
+++ b/app/services/tasks/download_stats_migration_service.rb
@@ -12,25 +12,25 @@ def list_work_stat_info(output_path, after_timestamp = nil)
       # Log number of work stats retrieved and timestamp clause
         Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")
 
-        work_stats = []
+        aggregated_data = {}
         work_stats_retrieved_from_query_count = 0
 
-        Rails.logger.info("Retrieving work_stats in batches of #{PAGE_SIZE}")
-      # Fetch the work_stats in batches
+        Rails.logger.info('Retrieving work_stats from the database')
+      # Fetch the work_stats and aggregate them into monthly stats in Ruby, encountered issues with SQL queries
         query.find_each(batch_size: PAGE_SIZE) do |stat|
-          work_stats << {
-            file_id: stat.file_id,
-            date: stat.date,
-            downloads: stat.downloads
-          }
+          truncated_date = stat.date.beginning_of_month
+          # Group the file_id and truncated date to be used as a key
+          key = [stat.file_id, truncated_date]
+          # Initialize the hash for the key if it doesn't exist
+          aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 }
+          # Sum the downloads for each key
+          aggregated_data[key][:downloads] += stat.downloads
           work_stats_retrieved_from_query_count += 1
-          log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval')
+          log_progress(work_stats_retrieved_from_query_count, total_work_stats)
         end
 
-      # Perform aggregation of daily stats into monthly stats in Ruby, encountered issues with SQL queries
-        Rails.logger.info('Aggregating daily stats into monthly stats')
-        aggregated_work_stats = aggregate_downloads(work_stats)
-        Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats")
+        aggregated_work_stats = aggregated_data.values
+        Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats")
 
         # Write the work_stats to the specified CSV file
         write_to_csv(output_path, aggregated_work_stats)
@@ -69,7 +69,7 @@ def migrate_to_new_table(csv_path)
     private
 
     # Log progress at 25%, 50%, 75%, and 100%
-    def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval')
+    def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation')
       percentages = [0.25, 0.5, 0.75, 1.0]
       log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i }
       if log_intervals.include?(work_stats_count)
@@ -106,29 +106,9 @@ def work_data_from_stat(stat)
       WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id])
     end
 
-    def aggregate_downloads(work_stats)
-      aggregated_data = {}
-      aggregated_work_stats_count = 0
-
-      work_stats.each do |stat|
-        file_id = stat[:file_id]
-        truncated_date = stat[:date].beginning_of_month
-        downloads = stat[:downloads]
-
-        # Group the file_id and truncated date to be used as a key
-        key = [file_id, truncated_date]
-        # Initialize the hash for the key if it doesn't exist
-        aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 }
-        # Sum the downloads for each key
-        aggregated_data[key][:downloads] += downloads
-        aggregated_work_stats_count += 1
-        log_progress(aggregated_work_stats_count, work_stats.count, 'Aggregation')
-      end
-      aggregated_data.values
-    end
-
     # Method to write work stats to a CSV file
     def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloads'])
+      puts "Inspect work_stats: #{work_stats.inspect}"
       CSV.open(output_path, 'w', write_headers: true, headers: headers) do |csv|
         work_stats.each do |stat|
           csv << [stat[:file_id], stat[:date], stat[:downloads]]

From 184e896a0c16bd61f454ad34c60862f60990fbd8 Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Wed, 28 Aug 2024 15:44:42 -0400
Subject: [PATCH 31/32] tests

---
 .../download_stats_migration_service_spec.rb  | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index 90e5f6cf6..04ea74f5d 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -134,8 +134,29 @@
       service.migrate_to_new_table(output_path)
       expect(Rails.logger).to have_received(:error).with('An error occurred while migrating work stats: Simulated CSV read failure')
     end
+
+    context 'if a failure occurs during a private function' do
+      it 'handles and logs errors from create_hyc_download_stat' do
+        allow(Rails.logger).to receive(:error)
+        # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id
+        allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original
+        allow(HycDownloadStat).to receive(:find_or_initialize_by).with({:date=>"2023-03-01 00:00:00 UTC", :fileset_id=>"file_id_1"}).and_raise(StandardError, 'Simulated database query failure')
+        service.migrate_to_new_table(output_path)
+        expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for'))
+      end
+
+      it 'handles and logs errors from save_hyc_download_stat' do
+        allow(Rails.logger).to receive(:error)
+        # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id
+        allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original
+        allow(HycDownloadStat).to receive(:find_or_initialize_by).with({:date=>"2023-03-01 00:00:00 UTC", :fileset_id=>"file_id_1"}).and_raise(StandardError, 'Simulated database query failure')
+        service.migrate_to_new_table(output_path)
+        expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for'))
+      end
+    end
   end
 
+
   private
   def csv_to_hash_array(file_path)
     CSV.read(file_path, headers: true).map { |row| row.to_h.symbolize_keys }

From 66229d73e6ee505f8e8dcf1d286279d4712a531f Mon Sep 17 00:00:00 2001
From: David Campbell <davidcam-src@proton.me>
Date: Wed, 28 Aug 2024 16:10:25 -0400
Subject: [PATCH 32/32] tests

---
 .../tasks/download_stats_migration_service_spec.rb    | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb
index 04ea74f5d..ff52faa25 100644
--- a/spec/services/tasks/download_stats_migration_service_spec.rb
+++ b/spec/services/tasks/download_stats_migration_service_spec.rb
@@ -114,6 +114,8 @@
       service.migrate_to_new_table(output_path)
     end
 
+    after { HycDownloadStat.delete_all }
+
     it 'creates new HycDownloadStat works from the CSV file' do
       csv_to_hash_array(output_path).each_with_index do |csv_row, index|
         work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(csv_row[:file_id])
@@ -140,18 +142,17 @@
         allow(Rails.logger).to receive(:error)
         # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id
         allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original
-        allow(HycDownloadStat).to receive(:find_or_initialize_by).with({:date=>"2023-03-01 00:00:00 UTC", :fileset_id=>"file_id_1"}).and_raise(StandardError, 'Simulated database query failure')
+        allow(HycDownloadStat).to receive(:find_or_initialize_by).with({date: '2023-03-01 00:00:00 UTC', fileset_id: 'file_id_1'}).and_raise(StandardError, 'Simulated database query failure').once
         service.migrate_to_new_table(output_path)
         expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for'))
       end
 
       it 'handles and logs errors from save_hyc_download_stat' do
         allow(Rails.logger).to receive(:error)
-        # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id
-        allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original
-        allow(HycDownloadStat).to receive(:find_or_initialize_by).with({:date=>"2023-03-01 00:00:00 UTC", :fileset_id=>"file_id_1"}).and_raise(StandardError, 'Simulated database query failure')
+        # Simulate a failure during the saving of a HycDownloadStat object for a specific file_id
+        allow_any_instance_of(HycDownloadStat).to receive(:new_record?).and_raise(StandardError, 'Simulated save failure')
         service.migrate_to_new_table(output_path)
-        expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for'))
+        expect(Rails.logger).to have_received(:error).with(a_string_including('Error saving new row to HycDownloadStat')).at_least(1).times
       end
     end
   end