diff --git a/app/controllers/concerns/hyc/download_analytics_behavior.rb b/app/controllers/concerns/hyc/download_analytics_behavior.rb index cf144f3e4..f2c555b97 100644 --- a/app/controllers/concerns/hyc/download_analytics_behavior.rb +++ b/app/controllers/concerns/hyc/download_analytics_behavior.rb @@ -41,8 +41,8 @@ def track_download send_image: '0', ua: user_agent, # Recovering work id with a solr query - dimension1: record_id, - dimension2: record_title + dimension1: work_data[:work_id], + dimension2: work_data[:title] } uri.query = URI.encode_www_form(uri_params) response = HTTParty.get(uri.to_s) @@ -58,19 +58,16 @@ def track_download end def create_download_stat - record_id_value = record_id - work_type_value = work_type - admin_set_id_value = admin_set_id date = Date.today Rails.logger.debug('Creating or updating hyc-download-stat database entry with the following attributes:') - Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{record_id_value}, admin_set_id: #{admin_set_id_value}, work_type: #{work_type_value}, date: #{date.beginning_of_month}") + Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{work_data[:work_id]}, admin_set_id: #{work_data[:admin_set_id]}, work_type: #{work_data[:work_type]}, date: #{date.beginning_of_month}") stat = HycDownloadStat.find_or_initialize_by( fileset_id: fileset_id, - work_id: record_id_value, - admin_set_id: admin_set_id_value, - work_type: work_type_value, + work_id: work_data[:work_id], + admin_set_id: work_data[:admin_set_id], + work_type: work_data[:work_type], date: date.beginning_of_month ) stat.download_count += 1 @@ -87,36 +84,12 @@ def bot_request?(user_agent) browser.bot? 
# frozen_string_literal: true

# Utility methods for looking up work metadata in Solr.
module WorkUtilsHelper
  # Fetches summary data for the work that contains the given fileset.
  #
  # @param fileset_id [String] id of the fileset to look up
  # @return [Hash] keys :work_id, :work_type, :title, :admin_set_id and
  #   :admin_set_name; each value falls back to 'Unknown' when the Solr
  #   document lacks the corresponding field
  # @raise [RuntimeError] if no work contains the given fileset
  def self.fetch_work_data_by_fileset_id(fileset_id)
    # The `|| {}` fallback previously applied here was dead code: a nil result
    # is blank, so the raise below already covers the no-document case.
    work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first
    raise "No work found for fileset id: #{fileset_id}" if work.blank?

    # Fetch the admin set related to the work; tolerate a missing admin set
    # by falling back to an empty document so the hash below yields 'Unknown'.
    admin_set_name = work['admin_set_tesim']&.first || 'Unknown'
    admin_set = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs'].first || {}

    {
      work_id: work['id'] || 'Unknown',
      work_type: work.dig('has_model_ssim', 0) || 'Unknown',
      title: work['title_tesim']&.first || 'Unknown',
      admin_set_id: admin_set['id'] || 'Unknown',
      admin_set_name: admin_set_name
    }
  end
end
# frozen_string_literal: true
module Tasks
  # Migrates download statistics from the legacy FileDownloadStat table
  # (daily rows, hyrax local cache) into the HycDownloadStat table
  # (monthly aggregates), using a CSV file as the intermediate format.
  class DownloadStatsMigrationService
    # Batch size used when paging through FileDownloadStat rows.
    PAGE_SIZE = 1000

    # Aggregates daily FileDownloadStat rows into monthly counts per file
    # and writes the result to a CSV file.
    #
    # @param output_path [String] path of the CSV file to write
    # @param after_timestamp [String, nil] when present, only stats updated
    #   after this timestamp are included
    def list_work_stat_info(output_path, after_timestamp = nil)
      query = FileDownloadStat.all
      query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
      total_work_stats = query.count
      timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'

      # Log number of work stats retrieved and timestamp clause
      Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")

      aggregated_data = {}
      work_stats_retrieved_from_query_count = 0

      Rails.logger.info('Retrieving work_stats from the database')
      # Fetch the work_stats and aggregate them into monthly stats in Ruby;
      # encountered issues doing the equivalent grouping in SQL.
      query.find_each(batch_size: PAGE_SIZE) do |stat|
        truncated_date = stat.date.beginning_of_month
        # Group on [file_id, month] so daily rows roll up into monthly totals
        key = [stat.file_id, truncated_date]
        aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 }
        aggregated_data[key][:downloads] += stat.downloads
        work_stats_retrieved_from_query_count += 1
        log_progress(work_stats_retrieved_from_query_count, total_work_stats)
      end

      aggregated_work_stats = aggregated_data.values
      Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats")

      # Write the work_stats to the specified CSV file
      write_to_csv(output_path, aggregated_work_stats)
    rescue StandardError => e
      Rails.logger.error("An error occurred while listing work stats: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n"))
    end

    # Reads the CSV produced by #list_work_stat_info and creates or updates
    # the corresponding HycDownloadStat rows.
    #
    # @param csv_path [String] path of the CSV file to read
    def migrate_to_new_table(csv_path)
      csv_data = CSV.read(csv_path, headers: true)
      csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys }
      progress_tracker = {
        all_categories: 0,
        created: 0,
        updated: 0,
        skipped: 0,
        failed: 0
      }

      Rails.logger.info("Migrating #{csv_data_stats.count} work stats to the new table.")
      # Recreate or update objects in new table
      csv_data_stats.each do |stat|
        create_hyc_download_stat(stat, progress_tracker)
        progress_tracker[:all_categories] += 1
        log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration')
      end
      Rails.logger.info("Migration complete: #{progress_tracker[:created]} created, #{progress_tracker[:updated]} updated, #{progress_tracker[:skipped]} skipped, #{progress_tracker[:failed]} failed")
    rescue StandardError => e
      Rails.logger.error("An error occurred while migrating work stats: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n"))
    end

    private

    # Log progress at 25%, 50%, 75%, and 100%
    def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation')
      percentages = [0.25, 0.5, 0.75, 1.0]
      log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i }
      if log_intervals.include?(work_stats_count)
        percentage_done = percentages[log_intervals.index(work_stats_count)] * 100
        Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{work_stats_count}/#{total_work_stats} work_stats)")
      end
    end

    # Builds (or finds) the HycDownloadStat row for one CSV stat and saves it.
    # FIX: the save now happens inside the rescued region. Previously it ran
    # after the begin/rescue, so when the lookup raised, a nil stat was still
    # passed to save_hyc_download_stat, raising NoMethodError there and
    # counting the same stat as failed twice.
    def create_hyc_download_stat(stat, progress_tracker)
      hyc_download_stat = HycDownloadStat.find_or_initialize_by(
        fileset_id: stat[:file_id].to_s,
        date: stat[:date]
      )
      work_data = work_data_from_stat(stat)
      hyc_download_stat.assign_attributes(
        fileset_id: stat[:file_id],
        work_id: work_data[:work_id],
        admin_set_id: work_data[:admin_set_id],
        work_type: work_data[:work_type],
        date: stat[:date],
        download_count: stat[:downloads],
      )
      save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
    rescue StandardError => e
      Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{e.message}")
      progress_tracker[:failed] += 1
    end

    # Similar implementation to work_data in DownloadAnalyticsBehavior.
    # Memoization is not necessary here since this method is called per stat.
    def work_data_from_stat(stat)
      WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id])
    end

    # Writes the aggregated work stats to a CSV file.
    # FIX: removed a leftover debug `puts` that dumped the whole dataset to stdout.
    def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloads'])
      CSV.open(output_path, 'w', write_headers: true, headers: headers) do |csv|
        work_stats.each do |stat|
          csv << [stat[:file_id], stat[:date], stat[:downloads]]
        end
      end
      Rails.logger.info("Work stats successfully written to #{output_path}")
    end

    # Saves the HycDownloadStat object and updates the progress tracker.
    def save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
      if hyc_download_stat.new_record?
        hyc_download_stat.save
        progress_tracker[:created] += 1
      elsif hyc_download_stat.changed?
        hyc_download_stat.save
        progress_tracker[:updated] += 1
      else
        progress_tracker[:skipped] += 1
      end
    rescue StandardError => e
      Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{e.message}")
      progress_tracker[:failed] += 1
    end
  end
end
# frozen_string_literal: true
require 'time'
require 'optparse'
require 'optparse/date'

namespace :migrate_download_stats do
  desc 'output rows for download stat migration into a csv'
  task :list_rows, [:output_dir, :after] => :environment do |_t, _args|
    start_time = Time.now
    puts "[#{start_time.utc.iso8601}] starting listing of work data"
    options = {}

    opts = OptionParser.new
    opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]'
    opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val }
    opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val }
    args = opts.order!(ARGV) {}
    opts.parse!(args)

    # The option is used as a CSV file path, so validate the extension up front.
    # FIX: message previously asked for an "output directory", which conflicted
    # with the .csv requirement; also dropped the unused old_stats_csv local.
    unless options[:output_dir].present? && options[:output_dir].end_with?('.csv')
      puts 'Please provide a valid output file path with a .csv extension. Got ' + options[:output_dir].to_s
      exit 1
    end

    migration_service = Tasks::DownloadStatsMigrationService.new
    migration_service.list_work_stat_info(options[:output_dir], options[:after])
    puts "Listing completed in #{Time.now - start_time}s"
    puts "Stored id list to file: #{options[:output_dir]}"
    exit 0
  end

  desc 'migrate download stats to new table'
  task :migrate, [:csv_path] => :environment do |_t, _args|
    start_time = Time.now
    puts "[#{start_time.utc.iso8601}] Starting migration from CSV to new table"
    options = {}

    opts = OptionParser.new
    opts.banner = 'Usage: bundle exec rake migrate_download_stats:migrate -- [options]'
    opts.on('-c', '--csv-path ARG', String, 'Path to the CSV file to migrate') { |val| options[:csv_path] = val }
    args = opts.order!(ARGV) {}
    opts.parse!(args)

    unless options[:csv_path].present? && File.exist?(options[:csv_path])
      puts 'Please provide a valid CSV file path'
      exit 1
    end

    migration_service = Tasks::DownloadStatsMigrationService.new
    migration_service.migrate_to_new_table(options[:csv_path])
    puts "Migration completed in #{Time.now - start_time}s"
    exit 0
  end
end
{ false } @@ -201,8 +208,14 @@ context 'fileset without a parent work' do before do - allow(controller).to receive(:fetch_record).and_return([{}]) - allow(controller).to receive(:fetch_admin_set).and_return([{}]) + dummy_work_data = { + work_id: 'Unknown', + work_type: 'Unknown', + title: 'Unknown', + admin_set_id: 'Unknown', + admin_set_name: 'Unknown' + } + allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(dummy_work_data) end it 'records a download event with no work type' do @@ -332,58 +345,6 @@ end end - describe '#fetch_record' do - it 'fetches the record from Solr' do - expect(controller.send(:fetch_record)).to eq(mock_record) - end - end - - describe '#fetch_admin_set' do - it 'fetches the admin set from Solr' do - expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set) - end - end - - describe '#admin_set_id' do - it 'returns the admin set id' do - expect(controller.send(:admin_set_id)).to eq('h128zk07m') - end - end - - describe '#record_id' do - it 'returns the record id' do - expect(controller.send(:record_id)).to eq('1z40m031g') - end - - it 'returns Unknown if the record is blank' do - allow(controller).to receive(:fetch_record).and_return([]) - expect(controller.send(:record_id)).to eq('Unknown') - end - end - - describe '#fileset_id' do - it 'returns the fileset id from params' do - controller.params = { id: file_set.id } - expect(controller.send(:fileset_id)).to eq(file_set.id) - end - - it 'returns Unknown if params id is missing' do - controller.params = {} - expect(controller.send(:fileset_id)).to eq('Unknown') - end - end - - describe '#record_title' do - it 'returns the record title' do - expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response') - end - - it 'returns Unknown if the record title is blank' do - allow(controller).to 
# frozen_string_literal: true
FactoryBot.define do
  # Legacy hyrax download-stat rows used to exercise the migration service.
  factory :file_download_stat do
    sequence(:id) { |n| n }                    # auto-incrementing primary key
    sequence(:file_id) { |n| "file_id_#{n}" }  # unique fileset id per record
    # Random date within a fixed window; timestamps mirror the stat date.
    date { FFaker::Time.between(Date.new(2019, 1, 1), Date.new(2022, 1, 31)) }
    created_at { date }
    updated_at { date }
    downloads { rand(1..50) }
    user_id { rand(1..100) }
  end
end
# spec/factories/solr_query_result.rb
# frozen_string_literal: true

# Builds work/admin-set ids of the form '0001abc', '0002abc', ...
def generate_id(number)
  "#{number.to_s.rjust(4, '0')}abc"
end

FactoryBot.define do
  # Fake Solr documents; OpenStruct stands in for a Solr hash so no
  # ActiveRecord table is touched.
  factory :solr_query_result, class: OpenStruct do
    # Non-AR object: "saving" is a no-op that just returns the instance.
    to_create { |instance| instance }

    trait :work do
      sequence(:id) { |seq| generate_id(seq) }
      sequence(:title_tesim) { |seq| ["Test Title #{seq}"] }
      has_model_ssim { ['Article'] }
      admin_set_tesim { ['Open_Access_Articles_and_Book_Chapters'] }
      file_set_ids_ssim { ['file_set_id'] }
    end

    trait :admin_set do
      sequence(:id) { |seq| generate_id(seq) }
      has_model_ssim { ['AdminSet'] }
      title_tesim { ['Open_Access_Articles_and_Book_Chapters'] }
    end
  end
end
# frozen_string_literal: true
require 'rails_helper'
# NOTE(review): these two override requires look unrelated to a helper-module
# spec (likely copied from the downloads controller spec) — confirm and remove.
require Rails.root.join('app/overrides/controllers/hydra/controller/download_behavior_override.rb')
require Rails.root.join('app/overrides/controllers/hyrax/downloads_controller_override.rb')

RSpec.describe WorkUtilsHelper, type: :module do
  let(:fileset_id) { 'file-set-id' }
  let(:admin_set_name) { 'Open_Access_Articles_and_Book_Chapters' }
  # NOTE(review): the two lets below are never referenced in this spec.
  let(:example_admin_set_id) { 'h128zk07m' }
  let(:example_work_id) { '1z40m031g' }

  # Solr document for the work containing the fileset.
  let(:mock_record) { [{
    'has_model_ssim' => ['Article'],
    'id' => '1z40m031g',
    'title_tesim' => ['Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response'],
    'admin_set_tesim' => ['Open_Access_Articles_and_Book_Chapters']}
  ]
  }

  # Solr document for the work's admin set.
  let(:mock_admin_set) { [{
    'has_model_ssim' => ['AdminSet'],
    'id' => 'h128zk07m',
    'title_tesim' => ['Open_Access_Articles_and_Book_Chapters']}
  ]
  }

  let(:expected_work_data) { {
    work_id: '1z40m031g',
    work_type: 'Article',
    title: 'Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response',
    admin_set_id: 'h128zk07m',
    admin_set_name: 'Open_Access_Articles_and_Book_Chapters'
  }
  }

  before do
    # Stub the two Solr queries the helper issues (work lookup, admin-set lookup).
    allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => mock_record })
    allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", rows: 1).and_return('response' => { 'docs' => mock_admin_set })
  end

  describe '#fetch_work_data_by_fileset_id' do
    it 'fetches the work data correctly' do
      result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
      expect(result).to eq(expected_work_data)
    end

    it 'properly substitutes Unknown for missing values' do
      # Mock the solr response to simulate a work with missing values, if it somehow makes it past the initial nil check
      allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => [{ 'placeholder-key' => 'placeholder-value' }] })
      allow(ActiveFedora::SolrService).to receive(:get).with('title_tesim:Unknown', rows: 1).and_return('response' => { 'docs' => [] })
      result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
      expect(result[:work_id]).to eq('Unknown')
      expect(result[:work_type]).to eq('Unknown')
      expect(result[:title]).to eq('Unknown')
      expect(result[:admin_set_id]).to eq('Unknown')
    end

    context 'when no work is found' do
      before do
        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_id}", rows: 1).and_return('response' => { 'docs' => [] })
      end

      it 'raises an error if no work is found' do
        expect { WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) }.to raise_error(RuntimeError, "No work found for fileset id: #{fileset_id}")
      end
    end

    context 'when admin set is not found' do
      before do
        allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", rows: 1).and_return('response' => { 'docs' => [] })
      end

      it 'sets the admin_set_id to Unknown if admin set is not found' do
        result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
        expect(result[:admin_set_id]).to eq('Unknown')
      end
    end
  end
end
# frozen_string_literal: true
require 'rails_helper'

RSpec.describe Tasks::DownloadStatsMigrationService, type: :service do
  let(:admin_set_title) { 'Open_Access_Articles_and_Book_Chapters' }
  let(:mock_admin_set) { FactoryBot.create(:solr_query_result, :admin_set, title_tesim: [admin_set_title]) }
  let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') }
  let(:service) { described_class.new }

  before do
    # Every migrated stat resolves to the same admin set.
    allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_title}", rows: 1).and_return('response' => { 'docs' => [mock_admin_set] })
  end

  after do
    # Ensure the output file is removed after each test
    File.delete(output_path) if File.exist?(output_path)
  end

  # Smaller groups to allow for easier testing for aggregation of download stats from daily to monthly
  let(:file_download_stats) { [[
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 15), downloads: 5, file_id: 'file_id_1'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 1, 30), downloads: 5, file_id: 'file_id_1'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 15), downloads: 10, file_id: 'file_id_1'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 3, 30), downloads: 10, file_id: 'file_id_1'),
  ],
  [
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 15), downloads: 25, file_id: 'file_id_2'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 4, 30), downloads: 25, file_id: 'file_id_2'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 15), downloads: 50, file_id: 'file_id_2'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 5, 30), downloads: 50, file_id: 'file_id_2'),
  ],
  [
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 15), downloads: 100, file_id: 'file_id_3'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 6, 30), downloads: 100, file_id: 'file_id_3'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 15), downloads: 150, file_id: 'file_id_3'),
    FactoryBot.create(:file_download_stat, date: Date.new(2023, 7, 30), downloads: 150, file_id: 'file_id_3'),
  ]]
  }

  # Create a hash of [fileset_id, date.beginning_of_month] => download count for each file_download_stats
  let(:expected_aggregated_download_count) do
    file_download_stats.flatten.each_with_object(Hash.new(0)) do |stat, hash|
      hash[[stat.file_id, stat.date.beginning_of_month.to_datetime]] += stat.downloads
    end
  end

  # One fake Solr work document per daily stat, keyed by its file_id.
  let(:mock_works) do
    file_download_stats.flatten.map do |stat|
      FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id])
    end
  end

  describe '#list_work_stat_info' do
    it 'writes all works to the output CSV file' do
      file_download_stats.flatten.each_with_index do |stat, index|
        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
      end

      # Monthly aggregates expected from the daily fixtures above.
      expected_works = [
        { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '10' },
        { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' },
        { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '50' },
        { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '100' },
        { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '200' },
        { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '300' }
      ]
      service.list_work_stat_info(output_path, nil)

      expect(File).to exist(output_path)
      expect(csv_to_hash_array(output_path)).to match_array(expected_works)
    end

    it 'handles and logs errors' do
      allow(Rails.logger).to receive(:error)
      allow(FileDownloadStat).to receive(:all).and_raise(StandardError, 'Simulated database query failure')
      service.list_work_stat_info(output_path, nil)
      expect(Rails.logger).to have_received(:error).with('An error occurred while listing work stats: Simulated database query failure')
    end

    context 'with an after_timestamp' do
      let(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-05-05 00:00:00 UTC') }
      let(:old_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-04-05 00:00:00 UTC') }
      let(:recent_stat_file_ids) { recent_stats.map(&:file_id) }
      let(:old_stat_file_ids) { old_stats.map(&:file_id) }

      before do
        all_stats = recent_stats + old_stats
        all_works = all_stats.map do |stat|
          FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id])
        end
        all_stats.each_with_index do |stat, index|
          allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [all_works[index]] })
        end
      end

      it 'filters works by the given timestamp' do
        # Retrieve works created after 'updated_at' date for old stats
        service.list_work_stat_info(output_path, '2023-04-06 00:00:00 UTC')
        # NOTE(review): leftover debug `puts` below — consider removing before merge.
        puts "CSV data: #{csv_to_hash_array(output_path).inspect}"

        expect(File).to exist(output_path)
        expect(csv_to_hash_array(output_path).map { |work| work[:file_id] }).to match_array(recent_stat_file_ids)
        expect(csv_to_hash_array(output_path).map { |work| work[:file_id] }).not_to include(*old_stat_file_ids)
      end
    end
  end

  describe '#migrate_to_new_table' do
    before do
      file_download_stats.flatten.each_with_index do |stat, index|
        allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] })
      end
      # Run the full pipeline: list to CSV, then migrate the CSV.
      service.list_work_stat_info(output_path, nil)
      service.migrate_to_new_table(output_path)
    end

    after { HycDownloadStat.delete_all }

    it 'creates new HycDownloadStat works from the CSV file' do
      csv_to_hash_array(output_path).each_with_index do |csv_row, index|
        work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(csv_row[:file_id])
        csv_row_date = Date.parse(csv_row[:date]).beginning_of_month
        hyc_download_stat = HycDownloadStat.find_by(fileset_id: csv_row[:file_id], date: csv_row_date)

        expect(hyc_download_stat).to be_present
        expect(hyc_download_stat.fileset_id).to eq(csv_row[:file_id])
        expect(hyc_download_stat.work_id).to eq(work_data[:work_id])
        expect(hyc_download_stat.date).to eq(csv_row[:date].to_date)
        expect(hyc_download_stat.download_count).to eq(expected_aggregated_download_count[[csv_row[:file_id], csv_row_date]])
      end
    end

    it 'handles and logs errors' do
      allow(CSV).to receive(:read).and_raise(StandardError, 'Simulated CSV read failure')
      allow(Rails.logger).to receive(:error)
      service.migrate_to_new_table(output_path)
      expect(Rails.logger).to have_received(:error).with('An error occurred while migrating work stats: Simulated CSV read failure')
    end

    context 'if a failure occurs during a private function' do
      it 'handles and logs errors from create_hyc_download_stat' do
        allow(Rails.logger).to receive(:error)
        # Simulate a failure during the creation of a HycDownloadStat object for a specific file_id
        allow(HycDownloadStat).to receive(:find_or_initialize_by).and_call_original
        allow(HycDownloadStat).to receive(:find_or_initialize_by).with({date: '2023-03-01 00:00:00 UTC', fileset_id: 'file_id_1'}).and_raise(StandardError, 'Simulated database query failure').once
        service.migrate_to_new_table(output_path)
        expect(Rails.logger).to have_received(:error).with(a_string_including('Failed to create HycDownloadStat for'))
      end

      it 'handles and logs errors from save_hyc_download_stat' do
        allow(Rails.logger).to receive(:error)
        # Simulate a failure during the saving of a HycDownloadStat object for a specific file_id
        allow_any_instance_of(HycDownloadStat).to receive(:new_record?).and_raise(StandardError, 'Simulated save failure')
        service.migrate_to_new_table(output_path)
        expect(Rails.logger).to have_received(:error).with(a_string_including('Error saving new row to HycDownloadStat')).at_least(1).times
      end
    end
  end


  private

  # Parses the output CSV back into symbol-keyed hashes for assertions.
  def csv_to_hash_array(file_path)
    CSV.read(file_path, headers: true).map { |row| row.to_h.symbolize_keys }
  end

  # Helper method to convert an array of FileDownloadStat objects to an array of hashes
  # Checks for truncated date to the beginning of the month
  # NOTE(review): this helper is currently unused by any example in this file.
  def expected_works_for(stats)
    stats.map do |stat|
      {
        file_id: stat.file_id,
        date: stat.date.beginning_of_month.to_s,
        downloads: stat.downloads.to_s,
      }
    end
  end
end