From c2a2dce22d8f4d31832df73853f431726df6e973 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 19 Aug 2024 13:34:19 -0400 Subject: [PATCH 01/32] create rake task and service --- .../tasks/download_stats_migration_service.rb | 9 +++++++ lib/tasks/migrate_download_stats.rake | 26 +++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 app/services/tasks/download_stats_migration_service.rb create mode 100644 lib/tasks/migrate_download_stats.rake diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb new file mode 100644 index 000000000..d6ccc5aba --- /dev/null +++ b/app/services/tasks/download_stats_migration_service.rb @@ -0,0 +1,9 @@ +# frozen_string_literal: true +module Tasks + class DownloadStatsMigrationService + PAGE_SIZE = 1000 + def list_object_ids(output_path, after_timestamp = nil) + puts 'Test - Listing object ids for download stats migration' + end + end +end diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake new file mode 100644 index 000000000..12f3e4a3a --- /dev/null +++ b/lib/tasks/migrate_download_stats.rake @@ -0,0 +1,26 @@ +# frozen_string_literal: true +require 'time' +require 'optparse' +require 'optparse/date' + +namespace :migrate_download_stats do + desc 'list object ids for download stats migration' + task :list_ids, [:output_dir, :after] => :environment do |_t, _args| + start_time = Time.now + puts "[#{start_time.utc.iso8601}] starting listing of ids" + options = {} + + opts = OptionParser.new + opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_ids -- [options]' + opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val } + opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val } + args = opts.order!(ARGV) {} + opts.parse!(args) + + file_path = Tasks::DownloadStatsMigrationService.new.list_object_ids(options[:output_dir], options[:after]) + + puts "Listing completed in #{Time.now - start_time}s" + puts "Stored id list to file: #{file_path}" + exit 0 + end +end From fc8dbe3b088cab8dd21f66f63a12e8178e4fd2d5 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 19 Aug 2024 15:04:01 -0400 Subject: [PATCH 02/32] tests for listing ids in the download stats migration service --- .../tasks/download_stats_migration_service.rb | 15 +++++- spec/factories/file_download_stats.rb | 12 +++++ .../download_stats_migration_service_spec.rb | 46 +++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 spec/factories/file_download_stats.rb create mode 100644 spec/services/tasks/download_stats_migration_service_spec.rb diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index d6ccc5aba..2d9a11b9a 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -3,7 +3,20 @@ module Tasks class DownloadStatsMigrationService PAGE_SIZE = 1000 def list_object_ids(output_path, after_timestamp = nil) - puts 'Test - Listing object ids for download stats migration' + # Build the query + query = FileDownloadStat.select(:id) + query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? + + # Fetch the IDs in batches + ids = [] + query.find_in_batches(batch_size: PAGE_SIZE) do |batch| + ids.concat(batch.map(&:id)) + end + + # Write the IDs to the specified output file + File.open(output_path, 'w') do |file| + ids.each { |id| file.puts(id) } + end end end end diff --git a/spec/factories/file_download_stats.rb b/spec/factories/file_download_stats.rb new file mode 100644 index 000000000..48f136d5a --- /dev/null +++ b/spec/factories/file_download_stats.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true +FactoryBot.define do + factory :file_download_stat do + sequence(:id) { |n| n } # Auto-incrementing ID + date { FFaker::Time.between(Date.new(2019, 1, 1), Date.new(2024, 12, 31)) } # Random date between a range + downloads { rand(1..50) } + sequence(:file_id) { |n| "file_id_#{n}" } # Unique file ID for each record + created_at { date } + updated_at { date } + user_id { rand(1..100) } + end +end diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb new file mode 100644 index 000000000..5d2ca38a4 --- /dev/null +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true +require 'rails_helper' + +RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do + describe '#list_object_ids' do + let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.txt') } + let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) } + + before do + # Ensure the output file is removed before each test + File.delete(output_path) if File.exist?(output_path) + end + + it 'writes all IDs to the output file' do + service = described_class.new + service.list_object_ids(output_path, nil) + + expect(File).to exist(output_path) + output = File.read(output_path) + expected_ids = FileDownloadStat.pluck(:id) + output_ids = output.split("\n").map(&:to_i) + + expect(output_ids).to match_array(expected_ids) + end + + context 'with an after_timestamp' do + let(:after_timestamp) { 1.day.ago } + let!(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: 1.hour.ago) } + + it 'filters records by the given timestamp' do + # Create a file_download_stat that should not be included + old_stat = FactoryBot.create(:file_download_stat, updated_at: 2.days.ago) + + service = described_class.new + service.list_object_ids(output_path, after_timestamp) + + output = File.read(output_path) + expected_ids = recent_stats.pluck(:id) + output_ids = output.split("\n").map(&:to_i) + + expect(output_ids).not_to include(old_stat.id) + expect(output_ids).to match_array(expected_ids) + end + end + end +end From e7a35625ad42b75376d8ff97504c83c7cb10976d Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 19 Aug 2024 15:28:02 -0400 Subject: [PATCH 03/32] update date range in factory --- spec/factories/file_download_stats.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/factories/file_download_stats.rb b/spec/factories/file_download_stats.rb index 48f136d5a..8ad1892e5 100644 --- a/spec/factories/file_download_stats.rb +++ b/spec/factories/file_download_stats.rb @@ -2,7 +2,7 @@ FactoryBot.define do factory :file_download_stat do sequence(:id) { |n| n } # Auto-incrementing ID - date { FFaker::Time.between(Date.new(2019, 1, 1), Date.new(2024, 12, 31)) } # Random date between a range + date { FFaker::Time.between(Date.new(2019, 1, 1), Date.new(2022, 1, 31)) } # Random date between a range downloads { rand(1..50) } sequence(:file_id) { |n| "file_id_#{n}" } # Unique file ID for each record created_at { date } From 33bea2ca06b8bb7835931981bb5fa47ea44a56fb Mon Sep 17 00:00:00 2001 From: David Campbell Date: Wed, 21 Aug 2024 12:15:02 -0400 Subject: [PATCH 04/32] expand function to retrieve fields besides id, test refactor, enforce csv dir --- .../tasks/download_stats_migration_service.rb | 27 ++++++--- lib/tasks/migrate_download_stats.rake | 7 ++- .../download_stats_migration_service_spec.rb | 55 ++++++++++++++----- 3 files changed, 67 insertions(+), 22 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 2d9a11b9a..36ca1c6ef 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -2,20 +2,33 @@ module Tasks class DownloadStatsMigrationService PAGE_SIZE = 1000 - def list_object_ids(output_path, after_timestamp = nil) + def list_record_info(output_path, after_timestamp = nil) + # Build the query - query = FileDownloadStat.select(:id) + query = FileDownloadStat.select(:id, :date, :downloads, :file_id) query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? # Fetch the IDs in batches - ids = [] + records = [] query.find_in_batches(batch_size: PAGE_SIZE) do |batch| - ids.concat(batch.map(&:id)) + records.concat(batch.map { |record| { id: record.id, date: record.date, downloads: record.downloads, file_id: record.file_id } }) + end + + # Write the records to the specified CSV file + CSV.open(output_path, 'w', write_headers: true, headers: ['id', 'date', 'downloads', 'file_id']) do |csv| + records.each do |record| + csv << [record[:id], record[:date], record[:downloads], record[:file_id]] + end end + end + + def replicate_to_new_table(id_list_file, clean_index) + # Read the list of IDs from the file and strip whitespace + ids = File.readlines(id_list_file).map(&:strip) - # Write the IDs to the specified output file - File.open(output_path, 'w') do |file| - ids.each { |id| file.puts(id) } + # Reindex the objects + ids.each do |id| + hyc_download_stat = HycDownloadStat.find_or_initialize_by(id) end end end diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index 12f3e4a3a..09926a995 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -17,7 +17,12 @@ namespace :migrate_download_stats do args = opts.order!(ARGV) {} opts.parse!(args) - file_path = Tasks::DownloadStatsMigrationService.new.list_object_ids(options[:output_dir], options[:after]) + unless options[:output_dir].present? && options[:output_dir][-4] == '.csv' + puts 'Please provide a valid output directory with a .csv extension' + exit 1 + end + + file_path = Tasks::DownloadStatsMigrationService.new.list_record_info(options[:output_dir], options[:after]) puts "Listing completed in #{Time.now - start_time}s" puts "Stored id list to file: #{file_path}" diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index 5d2ca38a4..c709af5c0 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -2,8 +2,8 @@ require 'rails_helper' RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do - describe '#list_object_ids' do - let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.txt') } + describe '#list_record_info' do + let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') } let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) } before do @@ -11,16 +11,26 @@ File.delete(output_path) if File.exist?(output_path) end - it 'writes all IDs to the output file' do + it 'writes all records to the output CSV file' do service = described_class.new - service.list_object_ids(output_path, nil) + service.list_record_info(output_path, nil) expect(File).to exist(output_path) - output = File.read(output_path) - expected_ids = FileDownloadStat.pluck(:id) - output_ids = output.split("\n").map(&:to_i) - expect(output_ids).to match_array(expected_ids) + # Read and parse the CSV file + csv_data = CSV.read(output_path, headers: true) + output_records = csv_data.map { |row| row.to_h.symbolize_keys } + + expected_records = file_download_stats.map do |stat| + { + id: stat.id.to_s, + date: stat.date.to_s, + downloads: stat.downloads.to_s, + file_id: stat.file_id + } + end + + expect(output_records).to match_array(expected_records) end context 'with an after_timestamp' do @@ -32,14 +42,31 @@ old_stat = FactoryBot.create(:file_download_stat, updated_at: 2.days.ago) service = described_class.new - service.list_object_ids(output_path, after_timestamp) + service.list_record_info(output_path, after_timestamp) + + expect(File).to exist(output_path) + + # Read and parse the CSV file + csv_data = CSV.read(output_path, headers: true) + output_records = csv_data.map { |row| row.to_h.symbolize_keys } - output = File.read(output_path) - expected_ids = recent_stats.pluck(:id) - output_ids = output.split("\n").map(&:to_i) + expected_records = recent_stats.map do |stat| + { + id: stat.id.to_s, + date: stat.date.to_s, + downloads: stat.downloads.to_s, + file_id: stat.file_id + } + end - expect(output_ids).not_to include(old_stat.id) - expect(output_ids).to match_array(expected_ids) + # Ensure old_stat is not included and recent_stats are + expect(output_records).not_to include( + id: old_stat.id.to_s, + date: old_stat.date.to_s, + downloads: old_stat.downloads.to_s, + file_id: old_stat.file_id + ) + expect(output_records).to match_array(expected_records) end end end From 6a1b8ed7b288c9ac39d1d3fd3dc6de44d29487cd Mon Sep 17 00:00:00 2001 From: David Campbell Date: Wed, 21 Aug 2024 14:02:50 -0400 Subject: [PATCH 05/32] truncate by date, update or create stat function --- .../tasks/download_stats_migration_service.rb | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 36ca1c6ef..c1a3bbe7b 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -5,10 +5,13 @@ class DownloadStatsMigrationService def list_record_info(output_path, after_timestamp = nil) # Build the query - query = FileDownloadStat.select(:id, :date, :downloads, :file_id) + query = FileDownloadStat.select("file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads") query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? - # Fetch the IDs in batches + # Group by file_id and truncated month + query = query.group("file_id, DATE_TRUNC('month', date)").order('file_id, month') + + # Fetch the IDs in batches for memory efficiency and performance records = [] query.find_in_batches(batch_size: PAGE_SIZE) do |batch| records.concat(batch.map { |record| { id: record.id, date: record.date, downloads: record.downloads, file_id: record.file_id } }) @@ -22,14 +25,25 @@ def list_record_info(output_path, after_timestamp = nil) end end - def replicate_to_new_table(id_list_file, clean_index) - # Read the list of IDs from the file and strip whitespace - ids = File.readlines(id_list_file).map(&:strip) + def migrate_to_new_table(csv_path) + csv_data = CSV.read(csv_path, headers: true) + records = csv_data.map { |row| row.to_h.symbolize_keys } - # Reindex the objects - ids.each do |id| - hyc_download_stat = HycDownloadStat.find_or_initialize_by(id) + # Recreate or update objects in new table + records.each do |stat| + update_or_create_stat(stat) end end + + def update_or_create_stat(stat) + hyc_download_stat = HycDownloadStat.find_or_initialize_by(id: stat.id.to_s) + hyc_download_stat.assign_attributes( + date: stat.date, + downloads: stat.downloads, + file_id: stat.file_id, + user_id: stat.user_id + ) + hyc_download_stat.save + end end end From fd91dfee9625af66512670ae1b17a0043eb6a329 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Wed, 21 Aug 2024 14:59:09 -0400 Subject: [PATCH 06/32] addressing grouping errors --- .../tasks/download_stats_migration_service.rb | 23 +++--- .../download_stats_migration_service_spec.rb | 78 +++++++++---------- 2 files changed, 48 insertions(+), 53 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index c1a3bbe7b..f5f6e40b9 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -5,22 +5,22 @@ class DownloadStatsMigrationService def list_record_info(output_path, after_timestamp = nil) # Build the query - query = FileDownloadStat.select("file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads") + query = FileDownloadStat.select("id, file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads") query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? # Group by file_id and truncated month - query = query.group("file_id, DATE_TRUNC('month', date)").order('file_id, month') + query = query.group("id, file_id, date, downloads").order('file_id, date') # Fetch the IDs in batches for memory efficiency and performance records = [] - query.find_in_batches(batch_size: PAGE_SIZE) do |batch| - records.concat(batch.map { |record| { id: record.id, date: record.date, downloads: record.downloads, file_id: record.file_id } }) + query.in_batches(of: PAGE_SIZE, load: true) do |relation| + records.concat(relation.map { |record| { file_id: record.file_id, date: record.date, downloads: record.downloads } }) end # Write the records to the specified CSV file - CSV.open(output_path, 'w', write_headers: true, headers: ['id', 'date', 'downloads', 'file_id']) do |csv| + CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv| records.each do |record| - csv << [record[:id], record[:date], record[:downloads], record[:file_id]] + csv << [record[:file_id], record[:date], record[:downloads]] end end end @@ -36,12 +36,13 @@ def migrate_to_new_table(csv_path) end def update_or_create_stat(stat) - hyc_download_stat = HycDownloadStat.find_or_initialize_by(id: stat.id.to_s) + puts "Inspect stat: #{stat.inspect}" + puts "Stat file_id: #{stat[:file_id]}" + hyc_download_stat = HycDownloadStat.find_or_initialize_by(fileset_id: stat[:file_id].to_s) hyc_download_stat.assign_attributes( - date: stat.date, - downloads: stat.downloads, - file_id: stat.file_id, - user_id: stat.user_id + date: stat[:date], + download_count: stat[:downloads], + fileset_id: stat[:file_id] ) hyc_download_stat.save end diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index c709af5c0..a30ed59ff 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -2,26 +2,22 @@ require 'rails_helper' RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do - describe '#list_record_info' do - let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') } - let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) } - - before do - # Ensure the output file is removed before each test - File.delete(output_path) if File.exist?(output_path) - end - - it 'writes all records to the output CSV file' do - service = described_class.new - service.list_record_info(output_path, nil) + let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) } + let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') } + let(:csv_data) { CSV.read(output_path, headers: true) } + let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } } + let(:service) { described_class.new } - expect(File).to exist(output_path) - # Read and parse the CSV file - csv_data = CSV.read(output_path, headers: true) - output_records = csv_data.map { |row| row.to_h.symbolize_keys } + before do + # Ensure the output file is removed before each test + File.delete(output_path) if File.exist?(output_path) + end - expected_records = file_download_stats.map do |stat| + describe '#list_record_info' do + # Helper method to convert an array of FileDownloadStat objects to an array of hashes + def expected_records_for(stats) + stats.map do |stat| { id: stat.id.to_s, date: stat.date.to_s, @@ -29,44 +25,42 @@ file_id: stat.file_id } end + end - expect(output_records).to match_array(expected_records) + it 'writes all records to the output CSV file' do + service.list_record_info(output_path, nil) + + expect(File).to exist(output_path) + expect(output_records).to match_array(expected_records_for(file_download_stats)) end context 'with an after_timestamp' do let(:after_timestamp) { 1.day.ago } let!(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: 1.hour.ago) } + let!(:old_stat) { FactoryBot.create(:file_download_stat, updated_at: 2.days.ago) } it 'filters records by the given timestamp' do - # Create a file_download_stat that should not be included - old_stat = FactoryBot.create(:file_download_stat, updated_at: 2.days.ago) - - service = described_class.new service.list_record_info(output_path, after_timestamp) expect(File).to exist(output_path) + expect(output_records).not_to include(expected_records_for([old_stat]).first) + expect(output_records).to match_array(expected_records_for(recent_stats)) + end + end + end - # Read and parse the CSV file - csv_data = CSV.read(output_path, headers: true) - output_records = csv_data.map { |row| row.to_h.symbolize_keys } - - expected_records = recent_stats.map do |stat| - { - id: stat.id.to_s, - date: stat.date.to_s, - downloads: stat.downloads.to_s, - file_id: stat.file_id - } - end + describe '#migrate_to_new_table' do + before do + service.list_record_info(output_path, nil) + service.migrate_to_new_table(output_path) + end - # Ensure old_stat is not included and recent_stats are - expect(output_records).not_to include( - id: old_stat.id.to_s, - date: old_stat.date.to_s, - downloads: old_stat.downloads.to_s, - file_id: old_stat.file_id - ) - expect(output_records).to match_array(expected_records) + it 'creates new HycDownloadStat records from the CSV file' do + output_records.each do |record| + hyc_download_stat = HycDownloadStat.find(record[:id]) + expect(hyc_download_stat.date.to_s).to eq(record[:date]) + expect(hyc_download_stat.downloads).to eq(record[:downloads].to_i) + expect(hyc_download_stat.file_id).to eq(record[:file_id]) end end end From 143345c666c07db5f4ff433cd81f5d134d91175a Mon Sep 17 00:00:00 2001 From: David Campbell Date: Wed, 21 Aug 2024 15:25:30 -0400 Subject: [PATCH 07/32] check for truncated date in tests --- .../tasks/download_stats_migration_service.rb | 2 +- .../download_stats_migration_service_spec.rb | 20 +++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index f5f6e40b9..08ed9660d 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -9,7 +9,7 @@ def list_record_info(output_path, after_timestamp = nil) query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? # Group by file_id and truncated month - query = query.group("id, file_id, date, downloads").order('file_id, date') + query = query.group('id, file_id, date, downloads').order('file_id, date') # Fetch the IDs in batches for memory efficiency and performance records = [] diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index a30ed59ff..f3cc23f55 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -2,7 +2,19 @@ require 'rails_helper' RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do - let!(:file_download_stats) { FactoryBot.create_list(:file_download_stat, 10) } + let!(:file_download_stats) do + [ + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 1)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 1)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 1)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 15)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30)) + ] + end let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') } let(:csv_data) { CSV.read(output_path, headers: true) } let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } } @@ -16,13 +28,13 @@ describe '#list_record_info' do # Helper method to convert an array of FileDownloadStat objects to an array of hashes + # Checks for truncated date to the beginning of the month def expected_records_for(stats) stats.map do |stat| { - id: stat.id.to_s, - date: stat.date.to_s, + file_id: stat.file_id, + date: stat.date.beginning_of_month.to_s, downloads: stat.downloads.to_s, - file_id: stat.file_id } end end From cd78526a3eb0763964879a6f93fc443575787619 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Thu, 22 Aug 2024 17:01:36 -0400 Subject: [PATCH 08/32] refactor functions in download analytics behavior into a helper module --- .../hyc/download_analytics_behavior.rb | 45 ++----- app/helpers/work_utils_helper.rb | 19 +++ .../hyrax/downloads_controller_spec.rb | 125 ++++++++++-------- 3 files changed, 98 insertions(+), 91 deletions(-) create mode 100644 app/helpers/work_utils_helper.rb diff --git a/app/controllers/concerns/hyc/download_analytics_behavior.rb b/app/controllers/concerns/hyc/download_analytics_behavior.rb index cf144f3e4..0451751fa 100644 --- a/app/controllers/concerns/hyc/download_analytics_behavior.rb +++ b/app/controllers/concerns/hyc/download_analytics_behavior.rb @@ -41,8 +41,8 @@ def track_download send_image: '0', ua: user_agent, # Recovering work id with a solr query - dimension1: record_id, - dimension2: record_title + dimension1: work_data[:work_id], + dimension2: work_data[:title] } uri.query = URI.encode_www_form(uri_params) response = HTTParty.get(uri.to_s) @@ -58,19 +58,16 @@ def track_download end def create_download_stat - record_id_value = record_id - work_type_value = work_type - admin_set_id_value = admin_set_id date = Date.today Rails.logger.debug('Creating or updating hyc-download-stat database entry with the following attributes:') - Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{record_id_value}, admin_set_id: #{admin_set_id_value}, work_type: #{work_type_value}, date: #{date.beginning_of_month}") + Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{work_data[:work_id]}, admin_set_id: #{work_data[:admin_set_id]}, work_type: #{work_data[:work_type]}, date: #{date.beginning_of_month}") stat = HycDownloadStat.find_or_initialize_by( fileset_id: fileset_id, - work_id: record_id_value, - admin_set_id: admin_set_id_value, - work_type: work_type_value, + work_id: work_data[:work_id], + admin_set_id: work_data[:admin_set_id], + work_type: work_data[:work_type], date: date.beginning_of_month ) stat.download_count += 1 @@ -87,38 +84,14 @@ def bot_request?(user_agent) browser.bot? end - def fetch_record - @record ||= ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'] - end - - def fetch_admin_set - @admin_set ||= ActiveFedora::SolrService.get("title_tesim:#{@admin_set_name}", rows: 1)['response']['docs'] - end - - def admin_set_id - @admin_set_id ||= fetch_admin_set.dig(0, 'id') || 'Unknown' - end - - def record_id - @record_id ||= fetch_record.dig(0, 'id') || 'Unknown' - end - - def work_type - @work_type ||= fetch_record.dig(0, 'has_model_ssim', 0) || 'Unknown' - end - def fileset_id @fileset_id ||= params[:id] || 'Unknown' end - def record_title - @record_title ||= if !fetch_record.blank? && fetch_record[0]['title_tesim'] - fetch_record[0]['title_tesim'].first - else - 'Unknown' - end + def work_data + @work_data ||= WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) end - + def site_id @site_id ||= ENV['MATOMO_SITE_ID'] end diff --git a/app/helpers/work_utils_helper.rb b/app/helpers/work_utils_helper.rb new file mode 100644 index 000000000..84f23fdf2 --- /dev/null +++ b/app/helpers/work_utils_helper.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true +module WorkUtilsHelper + def self.fetch_work_data_by_fileset_id(fileset_id) + work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {} + raise "No work found for fileset id: #{fileset_id}" if work.blank? + # Fetch the admin set related to the work + admin_set_name = work['admin_set_tesim']&.first || 'Unknown' + admin_set = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs'].first || {} + + { + work_id: work['id'] || 'Unknown', + work_type: work.dig('has_model_ssim', 0) || 'Unknown', + title: work['title_tesim']&.first || 'Unknown', + admin_set_id: admin_set['id'] || 'Unknown', + admin_set_name: admin_set_name + } + end +end + \ No newline at end of file diff --git a/spec/controllers/hyrax/downloads_controller_spec.rb b/spec/controllers/hyrax/downloads_controller_spec.rb index e80aa4c37..137a292b7 100644 --- a/spec/controllers/hyrax/downloads_controller_spec.rb +++ b/spec/controllers/hyrax/downloads_controller_spec.rb @@ -29,6 +29,13 @@ 'admin_set_tesim' => ['Open_Access_Articles_and_Book_Chapters']} ] } + let(:mock_work_data) { { + work_id: '1z40m031g', + work_type: 'Article', + title: ['Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response'], + admin_set_id: 'h128zk07m', + admin_set_name: 'Open_Access_Articles_and_Book_Chapters' + } } let(:file_set) do FactoryBot.create(:file_with_work, user: user, content: File.open("#{fixture_path}/files/image.png")) end @@ -53,8 +60,7 @@ allow(stub_matomo) @user = user sign_in @user - allow(controller).to receive(:fetch_record).and_return(mock_record) - allow(controller).to receive(:fetch_admin_set).and_return(mock_admin_set) + allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(mock_work_data) allow(Hyrax::Analytics.config).to receive(:site_id).and_return(spec_site_id) allow(SecureRandom).to receive(:uuid).and_return('555') allow(Hyrax::VirusCheckerService).to receive(:file_has_virus?) { false } @@ -201,8 +207,14 @@ context 'fileset without a parent work' do before do - allow(controller).to receive(:fetch_record).and_return([{}]) - allow(controller).to receive(:fetch_admin_set).and_return([{}]) + dummy_work_data = { + work_id: 'Unknown', + work_type: 'Unknown', + title: 'Unknown', + admin_set_id: 'Unknown', + admin_set_name: 'Unknown' + } + allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(dummy_work_data) end it 'records a download event with no work type' do @@ -332,61 +344,64 @@ end end - describe '#fetch_record' do - it 'fetches the record from Solr' do - expect(controller.send(:fetch_record)).to eq(mock_record) - end - end - - describe '#fetch_admin_set' do - it 'fetches the admin set from Solr' do - expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set) - end - end - - describe '#admin_set_id' do - it 'returns the admin set id' do - expect(controller.send(:admin_set_id)).to eq('h128zk07m') - end - end - - describe '#record_id' do - it 'returns the record id' do - expect(controller.send(:record_id)).to eq('1z40m031g') - end - - it 'returns Unknown if the record is blank' do - allow(controller).to receive(:fetch_record).and_return([]) - expect(controller.send(:record_id)).to eq('Unknown') - end - end - - describe '#fileset_id' do - it 'returns the fileset id from params' do - controller.params = { id: file_set.id } - expect(controller.send(:fileset_id)).to eq(file_set.id) - end - - it 'returns Unknown if params id is missing' do - controller.params = {} - expect(controller.send(:fileset_id)).to eq('Unknown') - end - end - - describe '#record_title' do - it 'returns the record title' do - expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response') - end - - it 'returns Unknown if the record title is blank' do - allow(controller).to receive(:fetch_record).and_return([{ 'title_tesim' => nil }]) - expect(controller.send(:record_title)).to eq('Unknown') - end - end + # describe '#fetch_record' do + # it 'fetches the record from Solr' do + # expect(controller.send(:fetch_record)).to eq(mock_record) + # end + # end + + # describe '#fetch_admin_set' do + # it 'fetches the admin set from Solr' do + # expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set) + # end + # end + + # describe '#admin_set_id' do + # it 'returns the admin set id' do + # expect(controller.send(:admin_set_id)).to eq('h128zk07m') + # end + # end + + # describe '#record_id' do + # it 'returns the record id' do + # expect(controller.send(:record_id)).to eq('1z40m031g') + # end + + # it 'returns Unknown if the record is blank' do + # # allow(controller).to receive(:fetch_record).and_return([]) + # expect(controller.send(:record_id)).to eq('Unknown') + # end + # end + + # describe '#fileset_id' do + # it 'returns the fileset id from params' do + # controller.params = { id: file_set.id } + # expect(controller.send(:fileset_id)).to eq(file_set.id) + # end + + # it 'returns Unknown if params id is missing' do + # controller.params = {} + # expect(controller.send(:fileset_id)).to eq('Unknown') + # end + # end + + # describe '#record_title' do + # it 'returns the record title' do + # expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response') + # end + + # it 'returns Unknown if the record title is blank' do + # # allow(controller).to receive(:fetch_record).and_return([{ 'title_tesim' => nil }]) + # expect(controller.send(:record_title)).to eq('Unknown') + # end + # end describe '#site_id' do it 'returns the site id from ENV' do expect(controller.send(:site_id)).to eq('5') end + # it 'passes this simple test' do + # expect(true).to eq(true) + # end end end From 345b910f49e09df30f3f8d948514b9f52e35188a Mon Sep 17 00:00:00 2001 From: David Campbell Date: Thu, 22 Aug 2024 17:34:47 -0400 Subject: [PATCH 09/32] linting, tests for utility helper and removing comments --- .../hyc/download_analytics_behavior.rb | 2 +- app/helpers/work_utils_helper.rb | 29 ++++--- .../hyrax/downloads_controller_spec.rb | 58 +------------- spec/helpers/hyrax/work_utils_helper_spec.rb | 79 +++++++++++++++++++ 4 files changed, 96 insertions(+), 72 deletions(-) create mode 100644 spec/helpers/hyrax/work_utils_helper_spec.rb diff --git a/app/controllers/concerns/hyc/download_analytics_behavior.rb b/app/controllers/concerns/hyc/download_analytics_behavior.rb index 0451751fa..f2c555b97 100644 --- a/app/controllers/concerns/hyc/download_analytics_behavior.rb +++ b/app/controllers/concerns/hyc/download_analytics_behavior.rb @@ -91,7 +91,7 @@ def fileset_id def work_data @work_data ||= WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) end - + def site_id @site_id ||= ENV['MATOMO_SITE_ID'] end diff --git a/app/helpers/work_utils_helper.rb b/app/helpers/work_utils_helper.rb index 84f23fdf2..a92119f63 100644 --- a/app/helpers/work_utils_helper.rb +++ b/app/helpers/work_utils_helper.rb @@ -1,19 +1,18 @@ # frozen_string_literal: true module WorkUtilsHelper - def self.fetch_work_data_by_fileset_id(fileset_id) - work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {} - raise "No work found for fileset id: #{fileset_id}" if work.blank? - # Fetch the admin set related to the work - admin_set_name = work['admin_set_tesim']&.first || 'Unknown' - admin_set = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs'].first || {} + def self.fetch_work_data_by_fileset_id(fileset_id) + work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {} + raise "No work found for fileset id: #{fileset_id}" if work.blank? + # Fetch the admin set related to the work + admin_set_name = work['admin_set_tesim']&.first || 'Unknown' + admin_set = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs'].first || {} - { - work_id: work['id'] || 'Unknown', - work_type: work.dig('has_model_ssim', 0) || 'Unknown', - title: work['title_tesim']&.first || 'Unknown', - admin_set_id: admin_set['id'] || 'Unknown', - admin_set_name: admin_set_name - } - end + { + work_id: work['id'] || 'Unknown', + work_type: work.dig('has_model_ssim', 0) || 'Unknown', + title: work['title_tesim']&.first || 'Unknown', + admin_set_id: admin_set['id'] || 'Unknown', + admin_set_name: admin_set_name + } + end end - \ No newline at end of file diff --git a/spec/controllers/hyrax/downloads_controller_spec.rb b/spec/controllers/hyrax/downloads_controller_spec.rb index 137a292b7..7720af709 100644 --- a/spec/controllers/hyrax/downloads_controller_spec.rb +++ b/spec/controllers/hyrax/downloads_controller_spec.rb @@ -35,7 +35,8 @@ title: ['Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response'], admin_set_id: 'h128zk07m', admin_set_name: 'Open_Access_Articles_and_Book_Chapters' - } } + } + } let(:file_set) do FactoryBot.create(:file_with_work, user: user, content: File.open("#{fixture_path}/files/image.png")) end @@ -344,64 +345,9 @@ end end - # describe '#fetch_record' do - # it 'fetches the record from Solr' do - # expect(controller.send(:fetch_record)).to eq(mock_record) - # end - # end - - # describe '#fetch_admin_set' do - # it 'fetches the admin set from Solr' do - # expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set) - # end - # end - - # describe '#admin_set_id' do - # it 'returns the admin set id' do - # expect(controller.send(:admin_set_id)).to eq('h128zk07m') - # end - # end - - # describe '#record_id' do - # it 'returns the record id' do - # expect(controller.send(:record_id)).to eq('1z40m031g') - # end - - # it 'returns Unknown if the record is blank' do - # # allow(controller).to receive(:fetch_record).and_return([]) - # expect(controller.send(:record_id)).to eq('Unknown') - # end - # end - - # describe '#fileset_id' do - # it 'returns the fileset id from params' do - # controller.params = { id: file_set.id } - # expect(controller.send(:fileset_id)).to eq(file_set.id) - # end - - # it 'returns Unknown if params id is missing' do - # controller.params = {} - # expect(controller.send(:fileset_id)).to eq('Unknown') - # end - # end - - # describe '#record_title' do - # it 'returns the record title' do - # expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response') - # end - - # it 'returns Unknown if the record title is blank' do - # # allow(controller).to receive(:fetch_record).and_return([{ 'title_tesim' => nil }]) - # expect(controller.send(:record_title)).to eq('Unknown') - # end - # end - describe '#site_id' do it 'returns the site id from ENV' do expect(controller.send(:site_id)).to eq('5') end - # it 'passes this simple test' do - # expect(true).to eq(true) - # end end end diff --git a/spec/helpers/hyrax/work_utils_helper_spec.rb b/spec/helpers/hyrax/work_utils_helper_spec.rb new file mode 100644 index 000000000..f2e9c62d8 --- /dev/null +++ b/spec/helpers/hyrax/work_utils_helper_spec.rb @@ -0,0 +1,79 @@ +# frozen_string_literal: true +require 'rails_helper' +require Rails.root.join('app/overrides/controllers/hydra/controller/download_behavior_override.rb') +require Rails.root.join('app/overrides/controllers/hyrax/downloads_controller_override.rb') + +RSpec.describe WorkUtilsHelper, type: :module do + let(:fileset_id) { 'file-set-id' } + let(:admin_set_name) { 'Open_Access_Articles_and_Book_Chapters' } + let(:example_admin_set_id) { 'h128zk07m' } + let(:example_work_id) { '1z40m031g' } + + let(:mock_record) { [{ + 'has_model_ssim' => ['Article'], + 'id' => '1z40m031g', + 'title_tesim' => ['Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response'], + 'admin_set_tesim' => ['Open_Access_Articles_and_Book_Chapters']} + ] + } + + let(:mock_admin_set) { [{ + 'has_model_ssim' => ['AdminSet'], + 'id' => 'h128zk07m', + 'title_tesim' => ['Open_Access_Articles_and_Book_Chapters']} + ] + } + + let(:expected_work_data) { { + work_id: '1z40m031g', + work_type: 'Article', + title: 'Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response', + admin_set_id: 'h128zk07m', + admin_set_name: 'Open_Access_Articles_and_Book_Chapters' + } + } + + before do + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => mock_record }) + allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", rows: 1).and_return('response' => { 'docs' => mock_admin_set }) + end + + describe '#fetch_work_data_by_fileset_id' do + it 'fetches the work data correctly' do + result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) + expect(result).to eq(expected_work_data) + end + + it 'properly substitutes Unknown for missing values' do + # Mock the solr response to simulate a work with missing values, if it somehow makes it past the initial nil check + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => [{ 'placeholder-key' => 'placeholder-value' }] }) + allow(ActiveFedora::SolrService).to receive(:get).with('title_tesim:Unknown', rows: 1).and_return('response' => { 'docs' => [] }) + result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) + expect(result[:work_id]).to eq('Unknown') + expect(result[:work_type]).to eq('Unknown') + expect(result[:title]).to eq('Unknown') + expect(result[:admin_set_id]).to eq('Unknown') + end + + context 'when no work is found' do + before do + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => [] }) + end + + it 'raises an error if no work is found' do + expect { WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) }.to raise_error(RuntimeError, "No work found for fileset id: #{fileset_id}") + end + end + + context 'when admin set is not found' do + before do + allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", rows: 1).and_return('response' => { 'docs' => [] }) + end + + it 'sets the admin_set_id to Unknown if admin set is not found' do + result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) + expect(result[:admin_set_id]).to eq('Unknown') + end + end + end +end From 7d5572d38c25e7bad0da5688894747c62773d63b Mon Sep 17 00:00:00 2001 From: David Campbell Date: Fri, 23 Aug 2024 11:57:42 -0400 Subject: [PATCH 10/32] factory for solr query results, adding functionality to migration service --- .../tasks/download_stats_migration_service.rb | 16 +++++++-- spec/factories/solr_query_result.rb | 36 +++++++++++++++++++ .../download_stats_migration_service_spec.rb | 17 ++++++++- 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 spec/factories/solr_query_result.rb diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 08ed9660d..c3a74a237 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -31,20 +31,30 @@ def migrate_to_new_table(csv_path) # Recreate or update objects in new table records.each do |stat| - update_or_create_stat(stat) + create_hyc_download_stat(stat) end end - def update_or_create_stat(stat) + def create_hyc_download_stat(stat) puts "Inspect stat: #{stat.inspect}" puts "Stat file_id: #{stat[:file_id]}" hyc_download_stat = HycDownloadStat.find_or_initialize_by(fileset_id: stat[:file_id].to_s) + work_data = work_data_from_stat(stat) hyc_download_stat.assign_attributes( + fileset_id: stat[:file_id], + work_id: work_data[:work_id], + admin_set_id: work_data[:admin_set_id], + work_type: work_data[:work_type], date: stat[:date], download_count: stat[:downloads], - fileset_id: stat[:file_id] ) hyc_download_stat.save end + + # Similar implementation to work_data in DownloadAnalyticsBehavior + # Memoization is not necessary here since this method is called per stat + def work_data_from_stat(stat) + WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id]) + end end end diff --git a/spec/factories/solr_query_result.rb b/spec/factories/solr_query_result.rb new file mode 100644 index 000000000..e17445ceb --- /dev/null +++ b/spec/factories/solr_query_result.rb @@ -0,0 +1,36 @@ +# spec/factories/solr_query_result.rb +# frozen_string_literal: true + +# Helper method to generate work or admin IDs like '0001abc', '0002abc', etc. +def generate_id(n) + "#{n.to_s.rjust(4, '0')}abc" +end + +# Factory for creating Solr query results +FactoryBot.define do + factory :solr_query_result, class: OpenStruct do + + # General attributes that could be shared between works and admin sets + # sequence(:admin_set_id) { |n| "admin_set_id_#{n}" } + # sequence(:admin_set_name) { |n| "Admin Set #{n}" } + + trait :work do + # Default values for has_model_ssim, admin_set_tesim, and file_set_ids_ssim + has_model_ssim { ['Article'] } + admin_set_tesim { ['Open_Access_Articles_and_Book_Chapters'] } + file_set_ids_ssim { ['file_set_id'] } + sequence(:id) { |n| generate_id(n) } + sequence(:title_tesim) { |n| ["Test Title #{n}"] } + end + + trait :admin_set do + # Default values for has_model_ssim and title_tesim + has_model_ssim { ['AdminSet'] } + title_tesim { ['Open_Access_Articles_and_Book_Chapters'] } + sequence(:id) { |n| generate_id(n) } + end + + # Override the default save behavior to do nothing since it's a non-ActiveRecord object + to_create { |instance| instance } + end +end diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index f3cc23f55..81b1730a2 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -15,6 +15,10 @@ FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30)) ] end + let(:admin_set_title) { 'Open_Access_Articles_and_Book_Chapters' } + let!(:mock_admin_set) { FactoryBot.create(:solr_query_result) } + # Generate works for the each file_download_stat + let!(:mock_works) { file_download_stats.map { |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) } } let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') } let(:csv_data) { CSV.read(output_path, headers: true) } let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } } @@ -24,6 +28,12 @@ before do # Ensure the output file is removed before each test File.delete(output_path) if File.exist?(output_path) + allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_title}", rows: 1).and_return('response' => { 'docs' => [mock_admin_set] }) + file_download_stats.each_with_index do |stat, index| + # Assign a random number of downloads to each stat + puts "Stat file_id: #{stat.file_id}" + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) + end end describe '#list_record_info' do @@ -68,7 +78,12 @@ def expected_records_for(stats) end it 'creates new HycDownloadStat records from the CSV file' do - output_records.each do |record| + output_records.each_with_index do |record,index| + # WIP: Break at index 5 + if index == 5 + break + end + puts "Record: #{record.inspect}" hyc_download_stat = HycDownloadStat.find(record[:id]) expect(hyc_download_stat.date.to_s).to eq(record[:date]) expect(hyc_download_stat.downloads).to eq(record[:downloads].to_i) From b891812099eb8591e3771f54e9e94485ac27b83b Mon Sep 17 00:00:00 2001 From: David Campbell Date: Fri, 23 Aug 2024 13:29:11 -0400 Subject: [PATCH 11/32] helper functions for test class, handling truncation outside of query to get around group by errors --- .../tasks/download_stats_migration_service.rb | 54 ++++++-- .../download_stats_migration_service_spec.rb | 125 +++++++++++------- 2 files changed, 124 insertions(+), 55 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index c3a74a237..9d32c4d36 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -4,22 +4,37 @@ class DownloadStatsMigrationService PAGE_SIZE = 1000 def list_record_info(output_path, after_timestamp = nil) - # Build the query - query = FileDownloadStat.select("id, file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads") + # # Build the query + # query = FileDownloadStat.select("file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads") + # query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? + + # # Group by file_id and truncated month + # query = query.group('file_id, DATE_TRUNC(\'month\', date)').order('file_id, date') + query = FileDownloadStat.all query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? - # Group by file_id and truncated month - query = query.group('id, file_id, date, downloads').order('file_id, date') + # # Fetch the IDs in batches for memory efficiency and performance + # records = [] + # query.in_batches(of: PAGE_SIZE, load: true) do |relation| + # records.concat(relation.map { |record| { file_id: record.file_id, date: record.date, downloads: record.downloads } }) + # end - # Fetch the IDs in batches for memory efficiency and performance + # Fetch the records in batches to handle large datasets records = [] - query.in_batches(of: PAGE_SIZE, load: true) do |relation| - records.concat(relation.map { |record| { file_id: record.file_id, date: record.date, downloads: record.downloads } }) + query.find_each(batch_size: PAGE_SIZE) do |record| + records << { + file_id: record.file_id, + date: record.date, + downloads: record.downloads + } end + # Perform aggregation in Ruby + aggregated_records = aggregate_downloads(records) + # Write the records to the specified CSV file CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv| - records.each do |record| + aggregated_records.each do |record| csv << [record[:file_id], record[:date], record[:downloads]] end end @@ -38,7 +53,10 @@ def migrate_to_new_table(csv_path) def create_hyc_download_stat(stat) puts "Inspect stat: #{stat.inspect}" puts "Stat file_id: #{stat[:file_id]}" - hyc_download_stat = HycDownloadStat.find_or_initialize_by(fileset_id: stat[:file_id].to_s) + hyc_download_stat = HycDownloadStat.find_or_initialize_by( + fileset_id: stat[:file_id].to_s, + date: stat[:date] + ) work_data = work_data_from_stat(stat) hyc_download_stat.assign_attributes( fileset_id: stat[:file_id], @@ -56,5 +74,23 @@ def create_hyc_download_stat(stat) def work_data_from_stat(stat) WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id]) end + + def aggregate_downloads(records) + aggregated_data = {} + + records.each do |record| + file_id = record[:file_id] + truncated_date = record[:date].beginning_of_month + downloads = record[:downloads] + + # Group the file_id and truncated date to be used as a key + key = [file_id, truncated_date] + # Initialize the hash for the key if it doesn't exist + aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 } + # Sum the downloads for each key + aggregated_data[key][:downloads] += downloads + end + aggregated_data.values + end end end diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index 81b1730a2..8029c1bbd 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -2,50 +2,35 @@ require 'rails_helper' RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do - let!(:file_download_stats) do - [ - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 1)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 1)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 1)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 15)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30)) - ] - end let(:admin_set_title) { 'Open_Access_Articles_and_Book_Chapters' } - let!(:mock_admin_set) { FactoryBot.create(:solr_query_result) } - # Generate works for the each file_download_stat - let!(:mock_works) { file_download_stats.map { |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) } } + let(:mock_admin_set) { FactoryBot.create(:solr_query_result, :admin_set, title_tesim: [admin_set_title]) } let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') } - let(:csv_data) { CSV.read(output_path, headers: true) } - let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } } + # let(:csv_data) { CSV.read(output_path, headers: true) } + # let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } } let(:service) { described_class.new } - before do # Ensure the output file is removed before each test File.delete(output_path) if File.exist?(output_path) allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_title}", rows: 1).and_return('response' => { 'docs' => [mock_admin_set] }) - file_download_stats.each_with_index do |stat, index| - # Assign a random number of downloads to each stat - puts "Stat file_id: #{stat.file_id}" - allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) - end end describe '#list_record_info' do - # Helper method to convert an array of FileDownloadStat objects to an array of hashes - # Checks for truncated date to the beginning of the month - def expected_records_for(stats) - stats.map do |stat| - { - file_id: stat.file_id, - date: stat.date.beginning_of_month.to_s, - downloads: stat.downloads.to_s, - } + before do + @file_download_stats = [ + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 1)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 1)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 1)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 15)), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30)) + ] + @mock_works = @file_download_stats.map { |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) } + @file_download_stats.each_with_index do |stat, index| + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [@mock_works[index]] }) end end @@ -53,7 +38,7 @@ def expected_records_for(stats) service.list_record_info(output_path, nil) expect(File).to exist(output_path) - expect(output_records).to match_array(expected_records_for(file_download_stats)) + expect(csv_to_hash_array(output_path)).to match_array(expected_records_for(@file_download_stats)) end context 'with an after_timestamp' do @@ -65,30 +50,78 @@ def expected_records_for(stats) service.list_record_info(output_path, after_timestamp) expect(File).to exist(output_path) - expect(output_records).not_to include(expected_records_for([old_stat]).first) - expect(output_records).to match_array(expected_records_for(recent_stats)) + expect(csv_to_hash_array(output_path)).not_to include(expected_records_for([old_stat]).first) + expect(csv_to_hash_array(output_path)).to match_array(expected_records_for(recent_stats)) end end end describe '#migrate_to_new_table' do before do + # Smaller groups to enable easier testing for aggregation of download stats from daily to monthly + @file_download_stats = [[ + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 10, file_id: 'file_id_1'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 10, file_id: 'file_id_1'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'), + ], + [ + FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 10, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 10, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 10, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 10, file_id: 'file_id_2'), + ], + [ + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 10, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 10, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 10, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 10, file_id: 'file_id_3'), + ]] + @mock_works = @file_download_stats.flatten.map do |stat| + FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) + end + @file_download_stats.flatten.each_with_index do |stat, index| + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [@mock_works[index]] }) + end service.list_record_info(output_path, nil) service.migrate_to_new_table(output_path) end it 'creates new HycDownloadStat records from the CSV file' do - output_records.each_with_index do |record,index| - # WIP: Break at index 5 - if index == 5 - break - end - puts "Record: #{record.inspect}" - hyc_download_stat = HycDownloadStat.find(record[:id]) - expect(hyc_download_stat.date.to_s).to eq(record[:date]) - expect(hyc_download_stat.downloads).to eq(record[:downloads].to_i) - expect(hyc_download_stat.file_id).to eq(record[:file_id]) + csv_to_hash_array(output_path).each_with_index do |record, index| + # puts 'All HycDownloadStat records:' + # HycDownloadStat.all.each do |stat| + # puts "ID: #{stat.id}, File ID: #{stat.fileset_id}, Date: #{stat.date}, Downloads: #{stat.download_count}" + # end + # puts "Record: #{record.inspect}" + # hyc_download_stat = HycDownloadStat.find(record[:id]) + work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(record[:file_id]) + hyc_download_stat = HycDownloadStat.find_by(fileset_id: record[:file_id], date: record[:date].to_date.beginning_of_month) + + expect(hyc_download_stat).to be_present + expect(hyc_download_stat.fileset_id).to eq(record[:file_id]) + expect(hyc_download_stat.work_id).to eq(work_data[:work_id]) + expect(hyc_download_stat.date).to eq(record[:date].to_date) + # Each mocked record has 10 downloads per month, so the download count should be 20 + expect(hyc_download_stat.download_count).to eq(20) end end end + + private + def csv_to_hash_array(file_path) + CSV.read(file_path, headers: true).map { |row| row.to_h.symbolize_keys } + end + + # Helper method to convert an array of FileDownloadStat objects to an array of hashes + # Checks for truncated date to the beginning of the month + def expected_records_for(stats) + stats.map do |stat| + { + file_id: stat.file_id, + date: stat.date.beginning_of_month.to_s, + downloads: stat.downloads.to_s, + } + end + end end From 4f62fb624573e2e9ebe9972c6e5fed74a574b43e Mon Sep 17 00:00:00 2001 From: David Campbell Date: Fri, 23 Aug 2024 14:42:57 -0400 Subject: [PATCH 12/32] cleaning up test file --- .../download_stats_migration_service_spec.rb | 125 +++++++++--------- 1 file changed, 66 insertions(+), 59 deletions(-) diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index 8029c1bbd..abe9a4791 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -5,83 +5,96 @@ let(:admin_set_title) { 'Open_Access_Articles_and_Book_Chapters' } let(:mock_admin_set) { FactoryBot.create(:solr_query_result, :admin_set, title_tesim: [admin_set_title]) } let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') } - # let(:csv_data) { CSV.read(output_path, headers: true) } - # let(:output_records) { csv_data.map { |row| row.to_h.symbolize_keys } } let(:service) { described_class.new } before do - # Ensure the output file is removed before each test - File.delete(output_path) if File.exist?(output_path) allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_title}", rows: 1).and_return('response' => { 'docs' => [mock_admin_set] }) end - describe '#list_record_info' do - before do - @file_download_stats = [ - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 1)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 1)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 1)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 15)), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 12, 30)) - ] - @mock_works = @file_download_stats.map { |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) } - @file_download_stats.each_with_index do |stat, index| - allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [@mock_works[index]] }) - end + after do + # Ensure the output file is removed after each test + File.delete(output_path) if File.exist?(output_path) + end + + # Smaller groups to allow for easier testing for aggregation of download stats from daily to monthly + let(:file_download_stats) { [[ + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 10, file_id: 'file_id_1'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 10, file_id: 'file_id_1'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'), + ], + [ + FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 10, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 10, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 10, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 10, file_id: 'file_id_2'), + ], + [ + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 10, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 10, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 10, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 10, file_id: 'file_id_3'), + ]] + } + + let(:mock_works) do + file_download_stats.flatten.map do |stat| + FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) end + end + describe '#list_record_info' do it 'writes all records to the output CSV file' do + file_download_stats.flatten.each_with_index do |stat, index| + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) + end + + expected_records = [ + { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '20' }, + { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' }, + { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '20' }, + { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '20' }, + { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '20' }, + { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '20' } + ] service.list_record_info(output_path, nil) expect(File).to exist(output_path) - expect(csv_to_hash_array(output_path)).to match_array(expected_records_for(@file_download_stats)) + expect(csv_to_hash_array(output_path)).to match_array(expected_records) end context 'with an after_timestamp' do - let(:after_timestamp) { 1.day.ago } - let!(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: 1.hour.ago) } - let!(:old_stat) { FactoryBot.create(:file_download_stat, updated_at: 2.days.ago) } + let(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-05-05 00:00:00 UTC') } + let(:old_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-04-05 00:00:00 UTC') } + let(:recent_stat_file_ids) { recent_stats.map(&:file_id) } + let(:old_stat_file_ids) { old_stats.map(&:file_id) } + + before do + all_stats = recent_stats + old_stats + all_works = all_stats.map do |stat| + FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) + end + all_stats.each_with_index do |stat, index| + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [all_works[index]] }) + end + end it 'filters records by the given timestamp' do - service.list_record_info(output_path, after_timestamp) + # Retrieve records created after 'updated_at' date for old stats + service.list_record_info(output_path, '2023-04-06 00:00:00 UTC') + puts "CSV data: #{csv_to_hash_array(output_path).inspect}" expect(File).to exist(output_path) - expect(csv_to_hash_array(output_path)).not_to include(expected_records_for([old_stat]).first) - expect(csv_to_hash_array(output_path)).to match_array(expected_records_for(recent_stats)) + expect(csv_to_hash_array(output_path).map { |record| record[:file_id] }).to match_array(recent_stat_file_ids) + expect(csv_to_hash_array(output_path).map { |record| record[:file_id] }).not_to include(*old_stat_file_ids) end end end describe '#migrate_to_new_table' do before do - # Smaller groups to enable easier testing for aggregation of download stats from daily to monthly - @file_download_stats = [[ - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 10, file_id: 'file_id_1'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 10, file_id: 'file_id_1'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'), - ], - [ - FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 10, file_id: 'file_id_2'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 10, file_id: 'file_id_2'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 10, file_id: 'file_id_2'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 10, file_id: 'file_id_2'), - ], - [ - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 10, file_id: 'file_id_3'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 10, file_id: 'file_id_3'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 10, file_id: 'file_id_3'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 10, file_id: 'file_id_3'), - ]] - @mock_works = @file_download_stats.flatten.map do |stat| - FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) - end - @file_download_stats.flatten.each_with_index do |stat, index| - allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [@mock_works[index]] }) + file_download_stats.flatten.each_with_index do |stat, index| + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) end service.list_record_info(output_path, nil) service.migrate_to_new_table(output_path) @@ -89,15 +102,9 @@ it 'creates new HycDownloadStat records from the CSV file' do csv_to_hash_array(output_path).each_with_index do |record, index| - # puts 'All HycDownloadStat records:' - # HycDownloadStat.all.each do |stat| - # puts "ID: #{stat.id}, File ID: #{stat.fileset_id}, Date: #{stat.date}, Downloads: #{stat.download_count}" - # end - # puts "Record: #{record.inspect}" - # hyc_download_stat = HycDownloadStat.find(record[:id]) work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(record[:file_id]) hyc_download_stat = HycDownloadStat.find_by(fileset_id: record[:file_id], date: record[:date].to_date.beginning_of_month) - + expect(hyc_download_stat).to be_present expect(hyc_download_stat.fileset_id).to eq(record[:file_id]) expect(hyc_download_stat.work_id).to eq(work_data[:work_id]) From 14325abbfd81691cc46ca624058bf326b4561e44 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Fri, 23 Aug 2024 14:57:02 -0400 Subject: [PATCH 13/32] add task to rake file --- .../tasks/download_stats_migration_service.rb | 17 +---------- lib/tasks/migrate_download_stats.rake | 29 +++++++++++++++++-- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 9d32c4d36..ff46391b1 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -3,22 +3,9 @@ module Tasks class DownloadStatsMigrationService PAGE_SIZE = 1000 def list_record_info(output_path, after_timestamp = nil) - - # # Build the query - # query = FileDownloadStat.select("file_id, DATE_TRUNC('month', date) AS date, SUM(downloads) AS downloads") - # query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? - - # # Group by file_id and truncated month - # query = query.group('file_id, DATE_TRUNC(\'month\', date)').order('file_id, date') query = FileDownloadStat.all query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? - # # Fetch the IDs in batches for memory efficiency and performance - # records = [] - # query.in_batches(of: PAGE_SIZE, load: true) do |relation| - # records.concat(relation.map { |record| { file_id: record.file_id, date: record.date, downloads: record.downloads } }) - # end - # Fetch the records in batches to handle large datasets records = [] query.find_each(batch_size: PAGE_SIZE) do |record| @@ -29,7 +16,7 @@ def list_record_info(output_path, after_timestamp = nil) } end - # Perform aggregation in Ruby + # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries aggregated_records = aggregate_downloads(records) # Write the records to the specified CSV file @@ -51,8 +38,6 @@ def migrate_to_new_table(csv_path) end def create_hyc_download_stat(stat) - puts "Inspect stat: #{stat.inspect}" - puts "Stat file_id: #{stat[:file_id]}" hyc_download_stat = HycDownloadStat.find_or_initialize_by( fileset_id: stat[:file_id].to_s, date: stat[:date] diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index 09926a995..e899d732a 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -21,11 +21,34 @@ namespace :migrate_download_stats do puts 'Please provide a valid output directory with a .csv extension' exit 1 end + + migration_service = Tasks::DownloadStatsMigrationService.new + old_stats_csv = migration_service.list_record_info(options[:output_dir], options[:after]) + puts "Listing completed in #{Time.now - start_time}s" + puts "Stored id list to file: #{options[:output_dir]}" + exit 0 + end - file_path = Tasks::DownloadStatsMigrationService.new.list_record_info(options[:output_dir], options[:after]) + desc 'migrate download stats to new table' + task :migrate, [:csv_path] => :environment do |_t, _args| + start_time = Time.now + puts "[#{start_time.utc.iso8601}] Starting migration from CSV to new table" + options = {} - puts "Listing completed in #{Time.now - start_time}s" - puts "Stored id list to file: #{file_path}" + opts = OptionParser.new + opts.banner = 'Usage: bundle exec rake migrate_download_stats:migrate -- [options]' + opts.on('-c', '--csv-path ARG', String, 'Path to the CSV file to migrate') { |val| options[:csv_path] = val } + args = opts.order!(ARGV) {} + opts.parse!(args) + + unless options[:csv_path].present? && File.exist?(options[:csv_path]) + puts 'Please provide a valid CSV file path' + exit 1 + end + + migration_service = Tasks::DownloadStatsMigrationService.new + migration_service.migrate_to_new_table(options[:csv_path]) + puts "Migration completed in #{Time.now - start_time}s" exit 0 end end From 22f809b991c60e50a4437ef88f94249c494605c2 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Fri, 23 Aug 2024 14:57:23 -0400 Subject: [PATCH 14/32] rubocop --- lib/tasks/migrate_download_stats.rake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index e899d732a..70ef4e51b 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -21,7 +21,7 @@ namespace :migrate_download_stats do puts 'Please provide a valid output directory with a .csv extension' exit 1 end - + migration_service = Tasks::DownloadStatsMigrationService.new old_stats_csv = migration_service.list_record_info(options[:output_dir], options[:after]) puts "Listing completed in #{Time.now - start_time}s" From b738beb7f24a7cacc3e18db11d54eacedea7e5be Mon Sep 17 00:00:00 2001 From: David Campbell Date: Fri, 23 Aug 2024 15:16:57 -0400 Subject: [PATCH 15/32] error message change --- lib/tasks/migrate_download_stats.rake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index 70ef4e51b..2653fb5d7 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -18,7 +18,7 @@ namespace :migrate_download_stats do opts.parse!(args) unless options[:output_dir].present? && options[:output_dir][-4] == '.csv' - puts 'Please provide a valid output directory with a .csv extension' + puts 'Please provide a valid output directory with a .csv extension. Got ' + options[:output_dir].to_s exit 1 end From 4f995e431f96a291f7be9c87d7751448f1d7e7c2 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Fri, 23 Aug 2024 15:31:26 -0400 Subject: [PATCH 16/32] change csv check condition --- lib/tasks/migrate_download_stats.rake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index 2653fb5d7..b20b5242d 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -17,7 +17,7 @@ namespace :migrate_download_stats do args = opts.order!(ARGV) {} opts.parse!(args) - unless options[:output_dir].present? && options[:output_dir][-4] == '.csv' + unless options[:output_dir].present? && options[:output_dir].end_with?('.csv') puts 'Please provide a valid output directory with a .csv extension. Got ' + options[:output_dir].to_s exit 1 end From 71cf41747d563c31b99b2aaa7adb157a621d7a62 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Fri, 23 Aug 2024 16:03:27 -0400 Subject: [PATCH 17/32] name change --- lib/tasks/migrate_download_stats.rake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index b20b5242d..8d2a752f2 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -4,14 +4,14 @@ require 'optparse' require 'optparse/date' namespace :migrate_download_stats do - desc 'list object ids for download stats migration' - task :list_ids, [:output_dir, :after] => :environment do |_t, _args| + desc 'output object record data for download stats migration into a csv' + task :list_record_data, [:output_dir, :after] => :environment do |_t, _args| start_time = Time.now - puts "[#{start_time.utc.iso8601}] starting listing of ids" + puts "[#{start_time.utc.iso8601}] starting listing of record data" options = {} opts = OptionParser.new - opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_ids -- [options]' + opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_record_data -- [options]' opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val } opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val } args = opts.order!(ARGV) {} From 862a79300519665ee190baa28ce664185bdfedbe Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 11:07:02 -0400 Subject: [PATCH 18/32] logging and wording change --- .../tasks/download_stats_migration_service.rb | 60 +++++++++++++------ lib/tasks/migrate_download_stats.rake | 2 +- .../download_stats_migration_service_spec.rb | 38 ++++++------ 3 files changed, 61 insertions(+), 39 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index ff46391b1..f460fac7d 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -2,41 +2,63 @@ module Tasks class DownloadStatsMigrationService PAGE_SIZE = 1000 - def list_record_info(output_path, after_timestamp = nil) + def list_work_info(output_path, after_timestamp = nil) query = FileDownloadStat.all query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? + total_works = query.count + timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp' - # Fetch the records in batches to handle large datasets - records = [] - query.find_each(batch_size: PAGE_SIZE) do |record| - records << { - file_id: record.file_id, - date: record.date, - downloads: record.downloads + # Log number of works retrieved and timestamp clause + Rails.logger.info("Listing works #{timestamp_clause} to #{output_path}") + Rails.logger.info("#{total_works} works from the hyrax local cache.") + + works = [] + works_retrieved_from_query_count = 0 + + Rails.logger.info("Retrieving works in batches of #{PAGE_SIZE}") + # Fetch the works in batches + query.find_each(batch_size: PAGE_SIZE) do |work| + works << { + file_id: work.file_id, + date: work.date, + downloads: work.downloads } + works_retrieved_from_query_count += 1 + log_progress(works_retrieved_from_query_count, total_works) end # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries - aggregated_records = aggregate_downloads(records) + aggregated_works = aggregate_downloads(works) - # Write the records to the specified CSV file + # Write the works to the specified CSV file CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv| - aggregated_records.each do |record| - csv << [record[:file_id], record[:date], record[:downloads]] + aggregated_works.each do |work| + csv << [work[:file_id], work[:date], work[:downloads]] end end end def migrate_to_new_table(csv_path) csv_data = CSV.read(csv_path, headers: true) - records = csv_data.map { |row| row.to_h.symbolize_keys } + works = csv_data.map { |row| row.to_h.symbolize_keys } # Recreate or update objects in new table - records.each do |stat| + works.each do |stat| create_hyc_download_stat(stat) end end + # Log progress at 25%, 50%, 75%, and 100% + def log_progress(works_retrieved_from_query_count, total_works, is_aggregation = false) + percentages = [0.25, 0.5, 0.75, 1.0] + log_intervals = percentages.map { |percent| (total_works * percent).to_i } + if log_intervals.include?(works_retrieved_from_query_count) + percentage_done = percentages[log_intervals.index(works_retrieved_from_query_count)] * 100 + process_type = is_aggregation ? 'Aggregation' : 'Retrieval' + Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{works_retrieved_from_query_count}/#{total_works} works)") + end + end + def create_hyc_download_stat(stat) hyc_download_stat = HycDownloadStat.find_or_initialize_by( fileset_id: stat[:file_id].to_s, @@ -60,13 +82,13 @@ def work_data_from_stat(stat) WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id]) end - def aggregate_downloads(records) + def aggregate_downloads(works) aggregated_data = {} - records.each do |record| - file_id = record[:file_id] - truncated_date = record[:date].beginning_of_month - downloads = record[:downloads] + works.each do |work| + file_id = work[:file_id] + truncated_date = work[:date].beginning_of_month + downloads = work[:downloads] # Group the file_id and truncated date to be used as a key key = [file_id, truncated_date] diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index 8d2a752f2..53088fa27 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -23,7 +23,7 @@ namespace :migrate_download_stats do end migration_service = Tasks::DownloadStatsMigrationService.new - old_stats_csv = migration_service.list_record_info(options[:output_dir], options[:after]) + old_stats_csv = migration_service.list_work_info(options[:output_dir], options[:after]) puts "Listing completed in #{Time.now - start_time}s" puts "Stored id list to file: #{options[:output_dir]}" exit 0 diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index abe9a4791..edc45b19f 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -43,13 +43,13 @@ end end - describe '#list_record_info' do - it 'writes all records to the output CSV file' do + describe '#list_work_info' do + it 'writes all works to the output CSV file' do file_download_stats.flatten.each_with_index do |stat, index| allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) end - expected_records = [ + expected_works = [ { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '20' }, { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' }, { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '20' }, @@ -57,10 +57,10 @@ { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '20' }, { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '20' } ] - service.list_record_info(output_path, nil) + service.list_work_info(output_path, nil) expect(File).to exist(output_path) - expect(csv_to_hash_array(output_path)).to match_array(expected_records) + expect(csv_to_hash_array(output_path)).to match_array(expected_works) end context 'with an after_timestamp' do @@ -79,14 +79,14 @@ end end - it 'filters records by the given timestamp' do - # Retrieve records created after 'updated_at' date for old stats - service.list_record_info(output_path, '2023-04-06 00:00:00 UTC') + it 'filters works by the given timestamp' do + # Retrieve works created after 'updated_at' date for old stats + service.list_work_info(output_path, '2023-04-06 00:00:00 UTC') puts "CSV data: #{csv_to_hash_array(output_path).inspect}" expect(File).to exist(output_path) - expect(csv_to_hash_array(output_path).map { |record| record[:file_id] }).to match_array(recent_stat_file_ids) - expect(csv_to_hash_array(output_path).map { |record| record[:file_id] }).not_to include(*old_stat_file_ids) + expect(csv_to_hash_array(output_path).map { |work| work[:file_id] }).to match_array(recent_stat_file_ids) + expect(csv_to_hash_array(output_path).map { |work| work[:file_id] }).not_to include(*old_stat_file_ids) end end end @@ -96,20 +96,20 @@ file_download_stats.flatten.each_with_index do |stat, index| allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) end - service.list_record_info(output_path, nil) + service.list_work_info(output_path, nil) service.migrate_to_new_table(output_path) end - it 'creates new HycDownloadStat records from the CSV file' do - csv_to_hash_array(output_path).each_with_index do |record, index| - work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(record[:file_id]) - hyc_download_stat = HycDownloadStat.find_by(fileset_id: record[:file_id], date: record[:date].to_date.beginning_of_month) + it 'creates new HycDownloadStat works from the CSV file' do + csv_to_hash_array(output_path).each_with_index do |work, index| + work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(work[:file_id]) + hyc_download_stat = HycDownloadStat.find_by(fileset_id: work[:file_id], date: work[:date].to_date.beginning_of_month) expect(hyc_download_stat).to be_present - expect(hyc_download_stat.fileset_id).to eq(record[:file_id]) + expect(hyc_download_stat.fileset_id).to eq(work[:file_id]) expect(hyc_download_stat.work_id).to eq(work_data[:work_id]) - expect(hyc_download_stat.date).to eq(record[:date].to_date) - # Each mocked record has 10 downloads per month, so the download count should be 20 + expect(hyc_download_stat.date).to eq(work[:date].to_date) + # Each mocked work has 10 downloads per month, so the download count should be 20 expect(hyc_download_stat.download_count).to eq(20) end end @@ -122,7 +122,7 @@ def csv_to_hash_array(file_path) # Helper method to convert an array of FileDownloadStat objects to an array of hashes # Checks for truncated date to the beginning of the month - def expected_records_for(stats) + def expected_works_for(stats) stats.map do |stat| { file_id: stat.file_id, From d335b7251654362bc5fc094579b9a3708410f115 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 11:07:21 -0400 Subject: [PATCH 19/32] rubocop --- .../tasks/download_stats_migration_service.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index f460fac7d..d01de5b7f 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -50,13 +50,13 @@ def migrate_to_new_table(csv_path) # Log progress at 25%, 50%, 75%, and 100% def log_progress(works_retrieved_from_query_count, total_works, is_aggregation = false) - percentages = [0.25, 0.5, 0.75, 1.0] - log_intervals = percentages.map { |percent| (total_works * percent).to_i } - if log_intervals.include?(works_retrieved_from_query_count) - percentage_done = percentages[log_intervals.index(works_retrieved_from_query_count)] * 100 - process_type = is_aggregation ? 'Aggregation' : 'Retrieval' - Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{works_retrieved_from_query_count}/#{total_works} works)") - end + percentages = [0.25, 0.5, 0.75, 1.0] + log_intervals = percentages.map { |percent| (total_works * percent).to_i } + if log_intervals.include?(works_retrieved_from_query_count) + percentage_done = percentages[log_intervals.index(works_retrieved_from_query_count)] * 100 + process_type = is_aggregation ? 'Aggregation' : 'Retrieval' + Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{works_retrieved_from_query_count}/#{total_works} works)") + end end def create_hyc_download_stat(stat) From aa9bd558cba303f9e0ffb7dd8252ea4ea105c6ab Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 11:21:50 -0400 Subject: [PATCH 20/32] phrasing change and logging --- .../tasks/download_stats_migration_service.rb | 69 ++++++++++--------- lib/tasks/migrate_download_stats.rake | 10 +-- .../download_stats_migration_service_spec.rb | 8 +-- 3 files changed, 46 insertions(+), 41 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index d01de5b7f..4444f6671 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -2,60 +2,63 @@ module Tasks class DownloadStatsMigrationService PAGE_SIZE = 1000 - def list_work_info(output_path, after_timestamp = nil) + def list_work_stat_info(output_path, after_timestamp = nil) query = FileDownloadStat.all query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? - total_works = query.count + total_work_stats = query.count timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp' - # Log number of works retrieved and timestamp clause - Rails.logger.info("Listing works #{timestamp_clause} to #{output_path}") - Rails.logger.info("#{total_works} works from the hyrax local cache.") + # Log number of work_stats retrieved and timestamp clause + Rails.logger.info("Listing work_stats #{timestamp_clause} to #{output_path}") + Rails.logger.info("#{total_work_stats} work_stats from the hyrax local cache.") - works = [] - works_retrieved_from_query_count = 0 + work_stats = [] + work_stats_retrieved_from_query_count = 0 - Rails.logger.info("Retrieving works in batches of #{PAGE_SIZE}") - # Fetch the works in batches - query.find_each(batch_size: PAGE_SIZE) do |work| - works << { - file_id: work.file_id, - date: work.date, - downloads: work.downloads + Rails.logger.info("Retrieving work_stats in batches of #{PAGE_SIZE}") + # Fetch the work_stats in batches + query.find_each(batch_size: PAGE_SIZE) do |stat| + work_stats << { + file_id: stat.file_id, + date: stat.date, + downloads: stat.downloads } - works_retrieved_from_query_count += 1 - log_progress(works_retrieved_from_query_count, total_works) + work_stats_retrieved_from_query_count += 1 + log_progress(work_stats_retrieved_from_query_count, total_work_stats) end # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries - aggregated_works = aggregate_downloads(works) + Rails.logger.info("Aggregating daily stats into monthly stats") + aggregated_work_stats = aggregate_downloads(work_stats) - # Write the works to the specified CSV file + Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats") + + # Write the work_stats to the specified CSV file CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv| - aggregated_works.each do |work| - csv << [work[:file_id], work[:date], work[:downloads]] + aggregated_work_stats.each do |stat| + csv << [stat[:file_id], stat[:date], stat[:downloads]] end end end def migrate_to_new_table(csv_path) csv_data = CSV.read(csv_path, headers: true) - works = csv_data.map { |row| row.to_h.symbolize_keys } + work_stats = csv_data.map { |row| row.to_h.symbolize_keys } # Recreate or update objects in new table - works.each do |stat| + work_stats.each do |stat| create_hyc_download_stat(stat) end end # Log progress at 25%, 50%, 75%, and 100% - def log_progress(works_retrieved_from_query_count, total_works, is_aggregation = false) + def log_progress(work_stats_count, total_work_stats, is_aggregation = false) percentages = [0.25, 0.5, 0.75, 1.0] - log_intervals = percentages.map { |percent| (total_works * percent).to_i } - if log_intervals.include?(works_retrieved_from_query_count) - percentage_done = percentages[log_intervals.index(works_retrieved_from_query_count)] * 100 + log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i } + if log_intervals.include?(work_stats_count) + percentage_done = percentages[log_intervals.index(work_stats_count)] * 100 process_type = is_aggregation ? 'Aggregation' : 'Retrieval' - Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{works_retrieved_from_query_count}/#{total_works} works)") + Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{work_stats_count}/#{total_work_stats} work_stats)") end end @@ -82,13 +85,14 @@ def work_data_from_stat(stat) WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id]) end - def aggregate_downloads(works) + def aggregate_downloads(work_stats) aggregated_data = {} + aggregated_work_stats_count = 0 - works.each do |work| - file_id = work[:file_id] - truncated_date = work[:date].beginning_of_month - downloads = work[:downloads] + work_stats.each do |stat| + file_id = stat[:file_id] + truncated_date = stat[:date].beginning_of_month + downloads = stat[:downloads] # Group the file_id and truncated date to be used as a key key = [file_id, truncated_date] @@ -96,6 +100,7 @@ def aggregate_downloads(works) aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 } # Sum the downloads for each key aggregated_data[key][:downloads] += downloads + log_progress(aggregated_work_stats_count, work_stats.count, is_aggregation: true) end aggregated_data.values end diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index 53088fa27..15edc556e 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -4,14 +4,14 @@ require 'optparse' require 'optparse/date' namespace :migrate_download_stats do - desc 'output object record data for download stats migration into a csv' - task :list_record_data, [:output_dir, :after] => :environment do |_t, _args| + desc 'output rows for download stat migration into a csv' + task :list_rows, [:output_dir, :after] => :environment do |_t, _args| start_time = Time.now - puts "[#{start_time.utc.iso8601}] starting listing of record data" + puts "[#{start_time.utc.iso8601}] starting listing of work data" options = {} opts = OptionParser.new - opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_record_data -- [options]' + opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]' opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val } opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val } args = opts.order!(ARGV) {} @@ -23,7 +23,7 @@ namespace :migrate_download_stats do end migration_service = Tasks::DownloadStatsMigrationService.new - old_stats_csv = migration_service.list_work_info(options[:output_dir], options[:after]) + old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after]) puts "Listing completed in #{Time.now - start_time}s" puts "Stored id list to file: #{options[:output_dir]}" exit 0 diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index edc45b19f..60a9ccc02 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -43,7 +43,7 @@ end end - describe '#list_work_info' do + describe '#list_work_stat_info' do it 'writes all works to the output CSV file' do file_download_stats.flatten.each_with_index do |stat, index| allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) @@ -57,7 +57,7 @@ { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '20' }, { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '20' } ] - service.list_work_info(output_path, nil) + service.list_work_stat_info(output_path, nil) expect(File).to exist(output_path) expect(csv_to_hash_array(output_path)).to match_array(expected_works) @@ -81,7 +81,7 @@ it 'filters works by the given timestamp' do # Retrieve works created after 'updated_at' date for old stats - service.list_work_info(output_path, '2023-04-06 00:00:00 UTC') + service.list_work_stat_info(output_path, '2023-04-06 00:00:00 UTC') puts "CSV data: #{csv_to_hash_array(output_path).inspect}" expect(File).to exist(output_path) @@ -96,7 +96,7 @@ file_download_stats.flatten.each_with_index do |stat, index| allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) end - service.list_work_info(output_path, nil) + service.list_work_stat_info(output_path, nil) service.migrate_to_new_table(output_path) end From a914e06df4324813fc1bb24f3d4ea5b8a3ad13e5 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 11:31:55 -0400 Subject: [PATCH 21/32] slight logging change and moving csv writing into another function --- .../tasks/download_stats_migration_service.rb | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 4444f6671..92fec6b47 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -8,9 +8,8 @@ def list_work_stat_info(output_path, after_timestamp = nil) total_work_stats = query.count timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp' - # Log number of work_stats retrieved and timestamp clause - Rails.logger.info("Listing work_stats #{timestamp_clause} to #{output_path}") - Rails.logger.info("#{total_work_stats} work_stats from the hyrax local cache.") + # Log number of work stats retrieved and timestamp clause + Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.") work_stats = [] work_stats_retrieved_from_query_count = 0 @@ -30,15 +29,10 @@ def list_work_stat_info(output_path, after_timestamp = nil) # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries Rails.logger.info("Aggregating daily stats into monthly stats") aggregated_work_stats = aggregate_downloads(work_stats) - Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats") # Write the work_stats to the specified CSV file - CSV.open(output_path, 'w', write_headers: true, headers: ['file_id', 'date', 'downloads']) do |csv| - aggregated_work_stats.each do |stat| - csv << [stat[:file_id], stat[:date], stat[:downloads]] - end - end + write_to_csv(output_path, aggregated_work_stats) end def migrate_to_new_table(csv_path) @@ -104,5 +98,16 @@ def aggregate_downloads(work_stats) end aggregated_data.values end + + # Method to write work stats to a CSV file + def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloads']) + CSV.open(output_path, 'w', write_headers: true, headers: headers) do |csv| + work_stats.each do |stat| + csv << [stat[:file_id], stat[:date], stat[:downloads]] + end + end + Rails.logger.info("Work stats successfully written to #{output_path}") + end + end end From 59145a5a9721648c112855addb4d5d1ef164fdb4 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 11:34:00 -0400 Subject: [PATCH 22/32] rubocop --- app/services/tasks/download_stats_migration_service.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 92fec6b47..c4c37f8c6 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -27,7 +27,7 @@ def list_work_stat_info(output_path, after_timestamp = nil) end # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries - Rails.logger.info("Aggregating daily stats into monthly stats") + Rails.logger.info('Aggregating daily stats into monthly stats') aggregated_work_stats = aggregate_downloads(work_stats) Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats") @@ -45,6 +45,8 @@ def migrate_to_new_table(csv_path) end end + private + # Log progress at 25%, 50%, 75%, and 100% def log_progress(work_stats_count, total_work_stats, is_aggregation = false) percentages = [0.25, 0.5, 0.75, 1.0] From df9279db84d5a09f53303fa6c6e48be7a9f56023 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 12:41:02 -0400 Subject: [PATCH 23/32] changed arguments for log progress function --- .../tasks/download_stats_migration_service.rb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index c4c37f8c6..e8cb37943 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -23,7 +23,7 @@ def list_work_stat_info(output_path, after_timestamp = nil) downloads: stat.downloads } work_stats_retrieved_from_query_count += 1 - log_progress(work_stats_retrieved_from_query_count, total_work_stats) + log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval') end # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries @@ -38,22 +38,25 @@ def list_work_stat_info(output_path, after_timestamp = nil) def migrate_to_new_table(csv_path) csv_data = CSV.read(csv_path, headers: true) work_stats = csv_data.map { |row| row.to_h.symbolize_keys } + migrated_work_stats_count = 0 + Rails.logger.info("Migrating #{work_stats.count} work stats to the new table.") # Recreate or update objects in new table work_stats.each do |stat| create_hyc_download_stat(stat) + migrated_work_stats_count += 1 + log_progress(migrated_work_stats_count, work_stats.count, 'Migration') end end private # Log progress at 25%, 50%, 75%, and 100% - def log_progress(work_stats_count, total_work_stats, is_aggregation = false) + def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval') percentages = [0.25, 0.5, 0.75, 1.0] log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i } if log_intervals.include?(work_stats_count) percentage_done = percentages[log_intervals.index(work_stats_count)] * 100 - process_type = is_aggregation ? 'Aggregation' : 'Retrieval' Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{work_stats_count}/#{total_work_stats} work_stats)") end end @@ -96,7 +99,8 @@ def aggregate_downloads(work_stats) aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 } # Sum the downloads for each key aggregated_data[key][:downloads] += downloads - log_progress(aggregated_work_stats_count, work_stats.count, is_aggregation: true) + aggregated_work_stats_count += 1 + log_progress(aggregated_work_stats_count, work_stats.count, 'Aggregation') end aggregated_data.values end From 27d54b7ad078d9ac651b28681a603370863aa0de Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 13:43:18 -0400 Subject: [PATCH 24/32] progress tracking in migration service --- .../tasks/download_stats_migration_service.rb | 94 ++++++++++++++----- 1 file changed, 72 insertions(+), 22 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index e8cb37943..456923025 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -37,16 +37,24 @@ def list_work_stat_info(output_path, after_timestamp = nil) def migrate_to_new_table(csv_path) csv_data = CSV.read(csv_path, headers: true) - work_stats = csv_data.map { |row| row.to_h.symbolize_keys } - migrated_work_stats_count = 0 - - Rails.logger.info("Migrating #{work_stats.count} work stats to the new table.") + csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys } + progress_tracker = { + all_categories: 0, + created: 0, + updated: 0, + skipped: 0, + failed: 0 + } + + Rails.logger.info("Migrating #{csv_data_stats.count} work stats to the new table.") # Recreate or update objects in new table - work_stats.each do |stat| - create_hyc_download_stat(stat) - migrated_work_stats_count += 1 - log_progress(migrated_work_stats_count, work_stats.count, 'Migration') + csv_data_stats.each do |stat| + create_hyc_download_stat(stat, progress_tracker) + # WIP: Thinking about moving updates into an update progress tracker method + # progress_tracker[:all_categories] += 1 + log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration') end + end private @@ -61,21 +69,45 @@ def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval') end end - def create_hyc_download_stat(stat) - hyc_download_stat = HycDownloadStat.find_or_initialize_by( - fileset_id: stat[:file_id].to_s, - date: stat[:date] - ) - work_data = work_data_from_stat(stat) - hyc_download_stat.assign_attributes( - fileset_id: stat[:file_id], - work_id: work_data[:work_id], - admin_set_id: work_data[:admin_set_id], - work_type: work_data[:work_type], - date: stat[:date], - download_count: stat[:downloads], + def create_hyc_download_stat(stat, progress_tracker) + begin + hyc_download_stat = HycDownloadStat.find_or_initialize_by( + fileset_id: stat[:file_id].to_s, + date: stat[:date] ) - hyc_download_stat.save + work_data = work_data_from_stat(stat) + hyc_download_stat.assign_attributes( + fileset_id: stat[:file_id], + work_id: work_data[:work_id], + admin_set_id: work_data[:admin_set_id], + work_type: work_data[:work_type], + date: stat[:date], + download_count: stat[:downloads], + ) + rescue StandardError => e + Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{e.message}") + progress_tracker[:failed] += 1 + end + save_hyc_download_stat(hyc_download_stat, stat, progress_tracker) + + # if hyc_download_stat.new_record? + # if hyc_download_stat.save + # return :created + # else + # Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}") + # return :failed + # end + # elsif hyc_download_stat.changed? + # if hyc_download_stat.save + # return :updated + # else + # Rails.logger.error("Failed to update HycDownloadStat for #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}") + # return :failed + # end + # else + # return :skipped + # end + end # Similar implementation to work_data in DownloadAnalyticsBehavior @@ -115,5 +147,23 @@ def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloa Rails.logger.info("Work stats successfully written to #{output_path}") end + # Method to save the HycDownloadStat object and update the progress tracker + def save_hyc_download_stat(hyc_download_stat, stat, progress_tracker) + begin + if hyc_download_stat.new_record? + hyc_download_stat.save + progress_tracker[:created] += 1 + elsif hyc_download_stat.changed? + hyc_download_stat.save + progress_tracker[:updated] += 1 + else + progress_tracker[:skipped] += 1 + end + rescue StandardError => e + Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{e.message}") + progress_tracker[:failed] += 1 + end + end + end end From 50058c4f9ac3792fd8853a7bbca36f930fc58f8e Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 14:56:53 -0400 Subject: [PATCH 25/32] error handling --- .../tasks/download_stats_migration_service.rb | 121 ++++++++---------- 1 file changed, 55 insertions(+), 66 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 456923025..6c92ee488 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -3,58 +3,66 @@ module Tasks class DownloadStatsMigrationService PAGE_SIZE = 1000 def list_work_stat_info(output_path, after_timestamp = nil) - query = FileDownloadStat.all - query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? - total_work_stats = query.count - timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp' - - # Log number of work stats retrieved and timestamp clause - Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.") - - work_stats = [] - work_stats_retrieved_from_query_count = 0 - - Rails.logger.info("Retrieving work_stats in batches of #{PAGE_SIZE}") - # Fetch the work_stats in batches - query.find_each(batch_size: PAGE_SIZE) do |stat| - work_stats << { - file_id: stat.file_id, - date: stat.date, - downloads: stat.downloads - } - work_stats_retrieved_from_query_count += 1 - log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval') - end + begin + query = FileDownloadStat.all + query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? + total_work_stats = query.count + timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp' + + # Log number of work stats retrieved and timestamp clause + Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.") + + work_stats = [] + work_stats_retrieved_from_query_count = 0 + + Rails.logger.info("Retrieving work_stats in batches of #{PAGE_SIZE}") + # Fetch the work_stats in batches + query.find_each(batch_size: PAGE_SIZE) do |stat| + work_stats << { + file_id: stat.file_id, + date: stat.date, + downloads: stat.downloads + } + work_stats_retrieved_from_query_count += 1 + log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval') + end - # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries - Rails.logger.info('Aggregating daily stats into monthly stats') - aggregated_work_stats = aggregate_downloads(work_stats) - Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats") + # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries + Rails.logger.info('Aggregating daily stats into monthly stats') + aggregated_work_stats = aggregate_downloads(work_stats) + Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats") - # Write the work_stats to the specified CSV file - write_to_csv(output_path, aggregated_work_stats) + # Write the work_stats to the specified CSV file + write_to_csv(output_path, aggregated_work_stats) + rescue StandardError => e + Rails.logger.error("An error occurred while listing work stats: #{e.message}") + Rails.logger.error(e.backtrace.join("\n")) + end end def migrate_to_new_table(csv_path) - csv_data = CSV.read(csv_path, headers: true) - csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys } - progress_tracker = { - all_categories: 0, - created: 0, - updated: 0, - skipped: 0, - failed: 0 - } - - Rails.logger.info("Migrating #{csv_data_stats.count} work stats to the new table.") - # Recreate or update objects in new table - csv_data_stats.each do |stat| - create_hyc_download_stat(stat, progress_tracker) - # WIP: Thinking about moving updates into an update progress tracker method - # progress_tracker[:all_categories] += 1 - log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration') + begin + csv_data = CSV.read(csv_path, headers: true) + csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys } + progress_tracker = { + all_categories: 0, + created: 0, + updated: 0, + skipped: 0, + failed: 0 + } + + Rails.logger.info("Migrating #{csv_data_stats.count} work stats to the new table.") + # Recreate or update objects in new table + csv_data_stats.each do |stat| + create_hyc_download_stat(stat, progress_tracker) + log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration') + end + Rails.logger.info("Migration complete: #{progress_tracker[:created]} created, #{progress_tracker[:updated]} updated, #{progress_tracker[:skipped]} skipped, #{progress_tracker[:failed]} failed") + rescue StandardError => e + Rails.logger.error("An error occurred while migrating work stats: #{e.message}") + Rails.logger.error(e.backtrace.join("\n")) end - end private @@ -89,25 +97,6 @@ def create_hyc_download_stat(stat, progress_tracker) progress_tracker[:failed] += 1 end save_hyc_download_stat(hyc_download_stat, stat, progress_tracker) - - # if hyc_download_stat.new_record? - # if hyc_download_stat.save - # return :created - # else - # Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}") - # return :failed - # end - # elsif hyc_download_stat.changed? - # if hyc_download_stat.save - # return :updated - # else - # Rails.logger.error("Failed to update HycDownloadStat for #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}") - # return :failed - # end - # else - # return :skipped - # end - end # Similar implementation to work_data in DownloadAnalyticsBehavior @@ -163,7 +152,7 @@ def save_hyc_download_stat(hyc_download_stat, stat, progress_tracker) Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{e.message}") progress_tracker[:failed] += 1 end - end + end end end From e9ea6dc92eaff109acc3562b2467ead17ef9b267 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Mon, 26 Aug 2024 18:25:34 -0400 Subject: [PATCH 26/32] increment all_categories in progress tracker --- app/services/tasks/download_stats_migration_service.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 6c92ee488..92a035950 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -56,6 +56,7 @@ def migrate_to_new_table(csv_path) # Recreate or update objects in new table csv_data_stats.each do |stat| create_hyc_download_stat(stat, progress_tracker) + progress_tracker[:all_categories] += 1 log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration') end Rails.logger.info("Migration complete: #{progress_tracker[:created]} created, #{progress_tracker[:updated]} updated, #{progress_tracker[:skipped]} skipped, #{progress_tracker[:failed]} failed") From 115df4222c1643a535c0f9b4b05141f2c57aa0bc Mon Sep 17 00:00:00 2001 From: David Campbell Date: Tue, 27 Aug 2024 17:09:51 -0400 Subject: [PATCH 27/32] tests --- .../tasks/download_stats_migration_service_spec.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index 60a9ccc02..467a61199 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -63,6 +63,13 @@ expect(csv_to_hash_array(output_path)).to match_array(expected_works) end + it 'handles and logs errors' do + allow(Rails.logger).to receive(:error) + allow(FileDownloadStat).to receive(:all).and_raise(StandardError, 'Simulated database query failure') + service.list_work_stat_info(output_path, nil) + expect(Rails.logger).to have_received(:error).with('An error occurred while listing work stats: Simulated database query failure') + end + context 'with an after_timestamp' do let(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-05-05 00:00:00 UTC') } let(:old_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-04-05 00:00:00 UTC') } @@ -113,6 +120,13 @@ expect(hyc_download_stat.download_count).to eq(20) end end + + it 'handles and logs errors' do + allow(CSV).to receive(:read).and_raise(StandardError, 'Simulated CSV read failure') + allow(Rails.logger).to receive(:error) + service.migrate_to_new_table(output_path) + expect(Rails.logger).to have_received(:error).with('An error occurred while migrating work stats: Simulated CSV read failure') + end end private From 177dc77f615db0e60cbad069e90c4574c9b04384 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Tue, 27 Aug 2024 18:09:51 -0400 Subject: [PATCH 28/32] remove comments --- spec/factories/solr_query_result.rb | 5 ----- 1 file changed, 5 deletions(-) diff --git a/spec/factories/solr_query_result.rb b/spec/factories/solr_query_result.rb index e17445ceb..b718f4e95 100644 --- a/spec/factories/solr_query_result.rb +++ b/spec/factories/solr_query_result.rb @@ -9,11 +9,6 @@ def generate_id(n) # Factory for creating Solr query results FactoryBot.define do factory :solr_query_result, class: OpenStruct do - - # General attributes that could be shared between works and admin sets - # sequence(:admin_set_id) { |n| "admin_set_id_#{n}" } - # sequence(:admin_set_name) { |n| "Admin Set #{n}" } - trait :work do # Default values for has_model_ssim, admin_set_tesim, and file_set_ids_ssim has_model_ssim { ['Article'] } From c2d339c303ad05855050f3e4d78412481874fa32 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Wed, 28 Aug 2024 13:29:16 -0400 Subject: [PATCH 29/32] update test to avoid checking for the same value --- .../tasks/download_stats_migration_service.rb | 2 +- .../download_stats_migration_service_spec.rb | 51 +++++++++++-------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index 92a035950..b5edcb7de 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -27,7 +27,7 @@ def list_work_stat_info(output_path, after_timestamp = nil) log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval') end - # Perform aggregation of daily stats ino monthly stats in Ruby, encountered issues with SQL queries + # Perform aggregation of daily stats into monthly stats in Ruby, encountered issues with SQL queries Rails.logger.info('Aggregating daily stats into monthly stats') aggregated_work_stats = aggregate_downloads(work_stats) Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats") diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index 467a61199..90e5f6cf6 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -18,25 +18,32 @@ # Smaller groups to allow for easier testing for aggregation of download stats from daily to monthly let(:file_download_stats) { [[ - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 10, file_id: 'file_id_1'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 10, file_id: 'file_id_1'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 5, file_id: 'file_id_1'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 5, file_id: 'file_id_1'), FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'), FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'), ], [ - FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 10, file_id: 'file_id_2'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 10, file_id: 'file_id_2'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 10, file_id: 'file_id_2'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 10, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 25, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 25, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 50, file_id: 'file_id_2'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 50, file_id: 'file_id_2'), ], [ - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 10, file_id: 'file_id_3'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 10, file_id: 'file_id_3'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 10, file_id: 'file_id_3'), - FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 10, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 100, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 100, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 150, file_id: 'file_id_3'), + FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 150, file_id: 'file_id_3'), ]] } + # Create a hash of [fileset_id, date.beginning_of_month] => download count for each file_download_stats + let(:expected_aggregated_download_count) do + file_download_stats.flatten.each_with_object(Hash.new(0)) do |stat, hash| + hash[[stat.file_id, stat.date.beginning_of_month.to_datetime]] += stat.downloads + end + end + let(:mock_works) do file_download_stats.flatten.map do |stat| FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) @@ -50,12 +57,12 @@ end expected_works = [ - { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '20' }, + { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '10' }, { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' }, - { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '20' }, - { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '20' }, - { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '20' }, - { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '20' } + { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '50' }, + { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '100' }, + { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '200' }, + { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '300' } ] service.list_work_stat_info(output_path, nil) @@ -108,16 +115,16 @@ end it 'creates new HycDownloadStat works from the CSV file' do - csv_to_hash_array(output_path).each_with_index do |work, index| - work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(work[:file_id]) - hyc_download_stat = HycDownloadStat.find_by(fileset_id: work[:file_id], date: work[:date].to_date.beginning_of_month) + csv_to_hash_array(output_path).each_with_index do |csv_row, index| + work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(csv_row[:file_id]) + csv_row_date = Date.parse(csv_row[:date]).beginning_of_month + hyc_download_stat = HycDownloadStat.find_by(fileset_id: csv_row[:file_id], date: csv_row_date) expect(hyc_download_stat).to be_present - expect(hyc_download_stat.fileset_id).to eq(work[:file_id]) + expect(hyc_download_stat.fileset_id).to eq(csv_row[:file_id]) expect(hyc_download_stat.work_id).to eq(work_data[:work_id]) - expect(hyc_download_stat.date).to eq(work[:date].to_date) - # Each mocked work has 10 downloads per month, so the download count should be 20 - expect(hyc_download_stat.download_count).to eq(20) + expect(hyc_download_stat.date).to eq(csv_row[:date].to_date) + expect(hyc_download_stat.download_count).to eq(expected_aggregated_download_count[[csv_row[:file_id], csv_row_date]]) end end From c7b2b6009b87f7019113d58e1c3ab9653efa5536 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Wed, 28 Aug 2024 14:36:31 -0400 Subject: [PATCH 30/32] aggregate stats during retrieval --- .../tasks/download_stats_migration_service.rb | 50 ++++++------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/app/services/tasks/download_stats_migration_service.rb b/app/services/tasks/download_stats_migration_service.rb index b5edcb7de..c8865193f 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -12,25 +12,25 @@ def list_work_stat_info(output_path, after_timestamp = nil) # Log number of work stats retrieved and timestamp clause Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.") - work_stats = [] + aggregated_data = {} work_stats_retrieved_from_query_count = 0 - Rails.logger.info("Retrieving work_stats in batches of #{PAGE_SIZE}") - # Fetch the work_stats in batches + Rails.logger.info('Retrieving work_stats from the database') + # Fetch the work_stats and aggregate them into monthly stats in Ruby, encountered issues with SQL queries query.find_each(batch_size: PAGE_SIZE) do |stat| - work_stats << { - file_id: stat.file_id, - date: stat.date, - downloads: stat.downloads - } + truncated_date = stat.date.beginning_of_month + # Group the file_id and truncated date to be used as a key + key = [stat.file_id, truncated_date] + # Initialize the hash for the key if it doesn't exist + aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 } + # Sum the downloads for each key + aggregated_data[key][:downloads] += stat.downloads work_stats_retrieved_from_query_count += 1 - log_progress(work_stats_retrieved_from_query_count, total_work_stats, 'Retrieval') + log_progress(work_stats_retrieved_from_query_count, total_work_stats) end - # Perform aggregation of daily stats into monthly stats in Ruby, encountered issues with SQL queries - Rails.logger.info('Aggregating daily stats into monthly stats') - aggregated_work_stats = aggregate_downloads(work_stats) - Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{work_stats.count} daily stats") + aggregated_work_stats = aggregated_data.values + Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats") # Write the work_stats to the specified CSV file write_to_csv(output_path, aggregated_work_stats) @@ -69,7 +69,7 @@ def migrate_to_new_table(csv_path) private # Log progress at 25%, 50%, 75%, and 100% - def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval') + def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation') percentages = [0.25, 0.5, 0.75, 1.0] log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i } if log_intervals.include?(work_stats_count) @@ -106,29 +106,9 @@ def work_data_from_stat(stat) WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id]) end - def aggregate_downloads(work_stats) - aggregated_data = {} - aggregated_work_stats_count = 0 - - work_stats.each do |stat| - file_id = stat[:file_id] - truncated_date = stat[:date].beginning_of_month - downloads = stat[:downloads] - - # Group the file_id and truncated date to be used as a key - key = [file_id, truncated_date] - # Initialize the hash for the key if it doesn't exist - aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 } - # Sum the downloads for each key - aggregated_data[key][:downloads] += downloads - aggregated_work_stats_count += 1 - log_progress(aggregated_work_stats_count, work_stats.count, 'Aggregation') - end - aggregated_data.values - end - # Method to write work stats to a CSV file def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloads']) + puts "Inspect work_stats: #{work_stats.inspect}" CSV.open(output_path, 'w', write_headers: true, headers: headers) do |csv| work_stats.each do |stat| csv << [stat[:file_id], stat[:date], stat[:downloads]] From 184e896a0c16bd61f454ad34c60862f60990fbd8 Mon Sep 17 00:00:00 2001 From: David Campbell Date: Wed, 28 Aug 2024 15:44:42 -0400 Subject: [PATCH 31/32] tests --- .../download_stats_migration_service_spec.rb | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index 90e5f6cf6..04ea74f5d 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -134,8 +134,29 @@ service.migrate_to_new_table(output_path) expect(Rails.logger).to have_received(:error).with('An error occurred while migrating work stats: Simulated CSV read failure') end + + context 'if a failure occurs during a private function' do + it 'handles and logs errors from create_hyc_download_stat' do + allow(Rails.logger).to receive(:error) + # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id + allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original + allow(HycDownloadStat).to receive(:find_or_initialize_by).with({:date=>"2023-03-01 00:00:00 UTC", :fileset_id=>"file_id_1"}).and_raise(StandardError, 'Simulated database query failure') + service.migrate_to_new_table(output_path) + expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for')) + end + + it 'handles and logs errors from save_hyc_download_stat' do + allow(Rails.logger).to receive(:error) + # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id + allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original + allow(HycDownloadStat).to receive(:find_or_initialize_by).with({:date=>"2023-03-01 00:00:00 UTC", :fileset_id=>"file_id_1"}).and_raise(StandardError, 'Simulated database query failure') + service.migrate_to_new_table(output_path) + expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for')) + end + end end + private def csv_to_hash_array(file_path) CSV.read(file_path, headers: true).map { |row| row.to_h.symbolize_keys } From 66229d73e6ee505f8e8dcf1d286279d4712a531f Mon Sep 17 00:00:00 2001 From: David Campbell Date: Wed, 28 Aug 2024 16:10:25 -0400 Subject: [PATCH 32/32] tests --- .../tasks/download_stats_migration_service_spec.rb | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index 04ea74f5d..ff52faa25 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -114,6 +114,8 @@ service.migrate_to_new_table(output_path) end + after { HycDownloadStat.delete_all } + it 'creates new HycDownloadStat works from the CSV file' do csv_to_hash_array(output_path).each_with_index do |csv_row, index| work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(csv_row[:file_id]) @@ -140,18 +142,17 @@ allow(Rails.logger).to receive(:error) # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original - allow(HycDownloadStat).to receive(:find_or_initialize_by).with({:date=>"2023-03-01 00:00:00 UTC", :fileset_id=>"file_id_1"}).and_raise(StandardError, 'Simulated database query failure') + allow(HycDownloadStat).to receive(:find_or_initialize_by).with({date: '2023-03-01 00:00:00 UTC', fileset_id: 'file_id_1'}).and_raise(StandardError, 'Simulated database query failure').once service.migrate_to_new_table(output_path) expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for')) end it 'handles and logs errors from save_hyc_download_stat' do allow(Rails.logger).to receive(:error) - # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id - allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original - allow(HycDownloadStat).to receive(:find_or_initialize_by).with({:date=>"2023-03-01 00:00:00 UTC", :fileset_id=>"file_id_1"}).and_raise(StandardError, 'Simulated database query failure') + # Simulate a failure during the saving of a HycDownloadStat object for a specific file_id + allow_any_instance_of(HycDownloadStat).to receive(:new_record?).and_raise(StandardError, 'Simulated save failure') service.migrate_to_new_table(output_path) - expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for')) + expect(Rails.logger).to have_received(:error).with(a_string_including('Error saving new row to HycDownloadStat')).at_least(1).times end end end