diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f2a2c4aed..8e3690c4d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -138,12 +138,12 @@ jobs: POSTGRES_PASSWORD: password TMPDIR: /tmp - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 with: name: test-coverage path: coverage - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 with: name: test-coverage-report path: coverage/coverage.json diff --git a/app/controllers/concerns/hyc/download_analytics_behavior.rb b/app/controllers/concerns/hyc/download_analytics_behavior.rb index f2c555b97..559b0adfd 100644 --- a/app/controllers/concerns/hyc/download_analytics_behavior.rb +++ b/app/controllers/concerns/hyc/download_analytics_behavior.rb @@ -18,16 +18,16 @@ def track_download client_ip = request.remote_ip user_agent = request.user_agent - matomo_id_site = site_id - matomo_security_token = auth_token - uri = URI("#{base_url}/matomo.php") + matomo_site_id = ENV['MATOMO_SITE_ID'] + matomo_security_token = ENV['MATOMO_AUTH_TOKEN'] + tracking_uri = URI("#{ENV['MATOMO_BASE_URL']}/matomo.php") # Some parameters are optional, but included since tracking would not work otherwise # https://developer.matomo.org/api-reference/tracking-api uri_params = { token_auth: matomo_security_token, rec: '1', - idsite: matomo_id_site, + idsite: matomo_site_id, action_name: 'Download', url: request.url, urlref: request.referrer, @@ -44,11 +44,11 @@ def track_download dimension1: work_data[:work_id], dimension2: work_data[:title] } - uri.query = URI.encode_www_form(uri_params) - response = HTTParty.get(uri.to_s) - Rails.logger.debug("Matomo download tracking URL: #{uri}") + tracking_uri.query = URI.encode_www_form(uri_params) + response = HTTParty.get(tracking_uri.to_s) + Rails.logger.debug("Matomo download tracking URL: #{tracking_uri}") if response.code >= 300 - Rails.logger.error("DownloadAnalyticsBehavior received an error response 
#{response.code} for matomo query: #{uri}") + Rails.logger.error("DownloadAnalyticsBehavior received an error response #{response.code} for matomo query: #{tracking_uri}") end # Send download events to db create_download_stat @@ -92,18 +92,6 @@ def work_data @work_data ||= WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id) end - def site_id - @site_id ||= ENV['MATOMO_SITE_ID'] - end - - def auth_token - @auth_token ||= ENV['MATOMO_AUTH_TOKEN'] - end - - def base_url - @base_url ||= ENV['MATOMO_BASE_URL'] - end - def client_id cookie = cookies.find { |key, _| key.start_with?('_pk_id') }&.last if cookie.present? diff --git a/app/helpers/work_utils_helper.rb b/app/helpers/work_utils_helper.rb index 1650a766f..9bb7d6fd4 100644 --- a/app/helpers/work_utils_helper.rb +++ b/app/helpers/work_utils_helper.rb @@ -1,20 +1,21 @@ # frozen_string_literal: true module WorkUtilsHelper def self.fetch_work_data_by_fileset_id(fileset_id) - work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {} - Rails.logger.warn("No work found for fileset id: #{fileset_id}") if work.blank? - # Fetch the admin set related to the work - admin_set_name = work['admin_set_tesim']&.first - # If the admin set name is not nil, fetch the admin set + # Retrieve the fileset data + fileset_data = ActiveFedora::SolrService.get("id:#{fileset_id}", rows: 1)['response']['docs'].first || {} + Rails.logger.warn("No fileset data found for fileset id: #{fileset_id}") if fileset_data.blank? + # Retrieve the work related to the fileset + work_data = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {} + Rails.logger.warn("No work found associated with fileset id: #{fileset_id}") if work_data.blank? # Set the admin set to an empty hash if the solr query returns nil - admin_set = admin_set_name ? 
ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", { :rows => 1, 'df' => 'title_tesim'})['response']['docs'].first || {} : {} - Rails.logger.warn(self.generate_warning_message(admin_set_name, fileset_id)) if admin_set.blank? - + admin_set_name = work_data['admin_set_tesim']&.first + admin_set_data = admin_set_name ? ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", { :rows => 1, 'df' => 'title_tesim'})['response']['docs'].first || {} : {} + Rails.logger.warn(self.generate_warning_message(admin_set_name, fileset_id)) if admin_set_data.blank? { - work_id: work['id'], - work_type: work.dig('has_model_ssim', 0), - title: work['title_tesim']&.first, - admin_set_id: admin_set['id'], + work_id: work_data['id'], + work_type: work_data.dig('has_model_ssim', 0), + title: work_data['title_tesim']&.first, + admin_set_id: admin_set_data['id'], admin_set_name: admin_set_name } end diff --git a/app/services/tasks/dimensions_query_service.rb b/app/services/tasks/dimensions_query_service.rb index 22c72915c..39fcedcab 100644 --- a/app/services/tasks/dimensions_query_service.rb +++ b/app/services/tasks/dimensions_query_service.rb @@ -195,7 +195,7 @@ def solr_query_builder(pub) # Query with paramaters to retrieve publications related to UNC def generate_query_string(start_date, end_date, page_size, cursor) - search_clauses = ['where type = "article"', "date >= \"#{start_date}\"", "date < \"#{end_date}\""].join(' and ') + search_clauses = ['where type = "article"', "date_inserted >= \"#{start_date}\"", "date_inserted < \"#{end_date}\""].join(' and ') return_fields = ['basics', 'extras', 'abstract', 'issn', 'publisher', 'journal_title_raw', 'linkout', 'concepts'].join(' + ') unc_affiliation_variants = ['"UNC-CH"', '"University of North Carolina at Chapel Hill"', '"UNC-Chapel Hill"', '"University of North Carolina-Chapel Hill"', '"University of North Carolina, Chapel Hill"'].join(' OR ') <<~QUERY diff --git a/app/services/tasks/download_stats_migration_service.rb 
b/app/services/tasks/download_stats_migration_service.rb index d1cdac068..1d80dcb0f 100644 --- a/app/services/tasks/download_stats_migration_service.rb +++ b/app/services/tasks/download_stats_migration_service.rb @@ -2,38 +2,32 @@ module Tasks class DownloadStatsMigrationService PAGE_SIZE = 1000 - def list_work_stat_info(output_path, after_timestamp = nil) - begin - query = FileDownloadStat.all - query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? - total_work_stats = query.count - timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp' - - # Log number of work stats retrieved and timestamp clause - Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.") - - aggregated_data = {} - work_stats_retrieved_from_query_count = 0 - - Rails.logger.info('Retrieving work_stats from the database') - # Fetch the work_stats and aggregate them into monthly stats in Ruby, encountered issues with SQL queries - query.find_each(batch_size: PAGE_SIZE) do |stat| - truncated_date = stat.date.beginning_of_month - # Group the file_id and truncated date to be used as a key - key = [stat.file_id, truncated_date] - # Initialize the hash for the key if it doesn't exist - aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 } - # Sum the downloads for each key - aggregated_data[key][:downloads] += stat.downloads - work_stats_retrieved_from_query_count += 1 - log_progress(work_stats_retrieved_from_query_count, total_work_stats) - end + module DownloadMigrationSource + MATOMO = :matomo + GA4 = :ga4 + CACHE = :cache - aggregated_work_stats = aggregated_data.values - Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats") + def self.all_sources + [MATOMO, GA4, CACHE] + end - # Write the work_stats to the specified CSV file - write_to_csv(output_path, 
aggregated_work_stats) + def self.valid?(source) + all_sources.include?(source) + end + end + def list_work_stat_info(output_path, after_timestamp = nil, before_timestamp = nil, source) + aggregated_work_stats = [] + begin + case source + when DownloadMigrationSource::CACHE + aggregated_work_stats = fetch_local_cache_stats(after_timestamp, output_path) + write_to_csv(output_path, aggregated_work_stats) + when DownloadMigrationSource::MATOMO + aggregated_work_stats = fetch_matomo_stats(after_timestamp, before_timestamp, output_path) + write_to_csv(output_path, aggregated_work_stats) + else + raise ArgumentError, "Unsupported source: #{source}" + end rescue StandardError => e Rails.logger.error("An error occurred while listing work stats: #{e.message}") Rails.logger.error(e.backtrace.join("\n")) @@ -68,6 +62,89 @@ def migrate_to_new_table(csv_path) private + # Method to fetch and aggregate work stats from Matomo + def fetch_matomo_stats(after_timestamp, before_timestamp, output_path) + aggregated_data = {} + # Keeps count of stats retrieved from Matomo from all queries + all_query_stat_total = 0 + # Log number of work stats retrieved and timestamp clause + timestamp_clause = "in specified range #{after_timestamp} to #{before_timestamp}" + Rails.logger.info("Fetching work stats #{timestamp_clause} from Matomo.") + + # Query Matomo API for each month in the range and aggregate the data + # Setting period to month will return stats for each month in the range, regardless of the specified date + reporting_uri = URI("#{ENV['MATOMO_BASE_URL']}/index.php") + # Fetch the first of each month in the range + months_array = first_of_each_month_in_range(after_timestamp, before_timestamp) + months_array.each_with_index do |first_date_of_month, index| + uri_params = { + module: 'API', + idSite: ENV['MATOMO_SITE_ID'], + method: 'Events.getName', + period: 'month', + date: first_date_of_month, + format: JSON, + token_auth: ENV['MATOMO_AUTH_TOKEN'], + flat: '1', + filter_pattern: 
'DownloadIR', + filter_limit: -1, + showColumns: 'nb_events', + } + reporting_uri.query = URI.encode_www_form(uri_params) + response = HTTParty.get(reporting_uri.to_s) + month_year_string = first_date_of_month.to_date.strftime('%B %Y') + Rails.logger.info("Processing Matomo response for #{month_year_string}. (#{index + 1}/#{months_array.count})") + response.parsed_response.each do |stat| + # Events_EventName is the file_id, nb_events is the number of downloads + update_aggregate_stats(aggregated_data, first_date_of_month, stat['Events_EventName'], stat['nb_events']) + end + monthly_stat_total = response.parsed_response.length + all_query_stat_total += monthly_stat_total + end + Rails.logger.info("Aggregated #{aggregated_data.values.count} monthly stats from #{all_query_stat_total} total retrieved stats") + # Return the aggregated data + aggregated_data.values + end + + def update_aggregate_stats(aggregated_data, truncated_date, file_id, downloads) + # Group the file_id and truncated date to be used as a key + key = [file_id, truncated_date] + # Initialize the hash for the key if it doesn't exist + aggregated_data[key] ||= { file_id: file_id, date: truncated_date, downloads: 0 } + # Sum the downloads for each key + aggregated_data[key][:downloads] += downloads + end + + def first_of_each_month_in_range(after_timestamp, before_timestamp) + after_date = after_timestamp.to_date.beginning_of_month + before_date = before_timestamp.to_date.beginning_of_month + (after_date..before_date).select { |d| d.day == 1 }.map(&:to_s) + end + + # Method to fetch and aggregate work stats from the local cache + def fetch_local_cache_stats(after_timestamp, output_path) + aggregated_data = {} + work_stats_retrieved_from_query_count = 0 + query = FileDownloadStat.all + query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? + total_work_stats = query.count + timestamp_clause = after_timestamp.present? ? 
"after specified time #{after_timestamp}" : 'without a timestamp' + + # Log number of work stats retrieved and timestamp clause + Rails.logger.info("Fetching #{total_work_stats} work stats #{timestamp_clause} from the hyrax local cache.") + + # Fetch the work_stats and aggregate them into monthly stats in Ruby, encountered issues with SQL queries + query.find_each(batch_size: PAGE_SIZE) do |stat| + update_aggregate_stats(aggregated_data, stat.date.beginning_of_month, stat.file_id, stat.downloads) + work_stats_retrieved_from_query_count += 1 + log_progress(work_stats_retrieved_from_query_count, total_work_stats) + end + + Rails.logger.info("Aggregated #{aggregated_data.values.count} monthly stats from #{total_work_stats} daily stats") + # Return the aggregated data + aggregated_data.values + end + # Log progress at 25%, 50%, 75%, and 100% def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation') percentages = [0.25, 0.5, 0.75, 1.0] diff --git a/lib/tasks/migrate_download_stats.rake b/lib/tasks/migrate_download_stats.rake index 15edc556e..e42c4945b 100644 --- a/lib/tasks/migrate_download_stats.rake +++ b/lib/tasks/migrate_download_stats.rake @@ -5,7 +5,7 @@ require 'optparse/date' namespace :migrate_download_stats do desc 'output rows for download stat migration into a csv' - task :list_rows, [:output_dir, :after] => :environment do |_t, _args| + task :list_rows, [:output_dir, :after, :before, :source] => :environment do |_t, _args| start_time = Time.now puts "[#{start_time.utc.iso8601}] starting listing of work data" options = {} @@ -14,6 +14,8 @@ namespace :migrate_download_stats do opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]' opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val } opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val } + opts.on('-b', 
'--before ARG', String, 'List objects updated before this timestamp, only meant for matomo and ga4 migrations') { |val| options[:before] = val } + opts.on('-s', '--source ARG', String, 'Data source (matomo, ga4, cache)') { |val| options[:source] = val.to_sym } args = opts.order!(ARGV) {} opts.parse!(args) @@ -22,8 +24,20 @@ namespace :migrate_download_stats do exit 1 end + unless Tasks::DownloadStatsMigrationService::DownloadMigrationSource.valid?(options[:source]) + puts "Please provide a valid source: #{Tasks::DownloadStatsMigrationService::DownloadMigrationSource.all_sources.join(', ')}" + exit 1 + end + + # Require both 'before' and 'after' arguments if the source is not 'cache' + if options[:source] != Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE && (!options[:before].present? || !options[:after].present?) + puts "Both 'before' and 'after' timestamps are required for sources other than #{Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE}" + exit 1 + end + + migration_service = Tasks::DownloadStatsMigrationService.new - old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after]) + old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after], options[:before], options[:source]) puts "Listing completed in #{Time.now - start_time}s" puts "Stored id list to file: #{options[:output_dir]}" exit 0 diff --git a/spec/controllers/hyrax/downloads_controller_spec.rb b/spec/controllers/hyrax/downloads_controller_spec.rb index 7720af709..1bf2ad587 100644 --- a/spec/controllers/hyrax/downloads_controller_spec.rb +++ b/spec/controllers/hyrax/downloads_controller_spec.rb @@ -345,9 +345,4 @@ end end - describe '#site_id' do - it 'returns the site id from ENV' do - expect(controller.send(:site_id)).to eq('5') - end - end end diff --git a/spec/fixtures/files/matomo_stats_migration_fixture.json b/spec/fixtures/files/matomo_stats_migration_fixture.json new file mode 100644 
index 000000000..9889da74d --- /dev/null +++ b/spec/fixtures/files/matomo_stats_migration_fixture.json @@ -0,0 +1,80 @@ +{ + "2024-01-01": [ + { + "label": "file_id_1 - DownloadIR", + "nb_events": 120, + "Events_EventName": "file_id_1", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_1 - DownloadIR", + "nb_events": 70, + "Events_EventName": "file_id_1", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_2 - DownloadIR", + "nb_events": 100, + "Events_EventName": "file_id_2", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_2 - DownloadIR", + "nb_events": 50, + "Events_EventName": "file_id_2", + "Events_EventAction": "DownloadIR" + }] + , + "2024-02-01": [ + { + "label": "file_id_3 - DownloadIR", + "nb_events": 10, + "Events_EventName": "file_id_3", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_3 - DownloadIR", + "nb_events": 90, + "Events_EventName": "file_id_3", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_4 - DownloadIR", + "nb_events": 50, + "Events_EventName": "file_id_4", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_4 - DownloadIR", + "nb_events": 30, + "Events_EventName": "file_id_4", + "Events_EventAction": "DownloadIR" + } + ], + "2024-03-01": [ + { + "label": "file_id_5 - DownloadIR", + "nb_events": 80, + "Events_EventName": "file_id_5", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_5 - DownloadIR", + "nb_events": 100, + "Events_EventName": "file_id_5", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_6 - DownloadIR", + "nb_events": 250, + "Events_EventName": "file_id_6", + "Events_EventAction": "DownloadIR" + }, + { + "label": "file_id_6 - DownloadIR", + "nb_events": 300, + "Events_EventName": "file_id_6", + "Events_EventAction": "DownloadIR" + } + ] +} \ No newline at end of file diff --git a/spec/helpers/hyrax/work_utils_helper_spec.rb b/spec/helpers/hyrax/work_utils_helper_spec.rb index 8774c9c23..f847af631 
100644 --- a/spec/helpers/hyrax/work_utils_helper_spec.rb +++ b/spec/helpers/hyrax/work_utils_helper_spec.rb @@ -38,23 +38,22 @@ } } - before do - allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_ids[0]}", rows: 1).and_return('response' => { 'docs' => mock_records[0] }) - allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", {'df'=>'title_tesim', :rows=>1}).and_return('response' => { 'docs' => mock_admin_set }) - end - describe '#fetch_work_data_by_fileset_id' do it 'fetches the work data correctly' do + allow(ActiveFedora::SolrService).to receive(:get).with("id:#{fileset_ids[0]}", rows: 1).and_return('response' => { 'docs' => mock_records[0] }) + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_ids[0]}", rows: 1).and_return('response' => { 'docs' => mock_records[0] }) + allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", {'df'=>'title_tesim', :rows=>1}).and_return('response' => { 'docs' => mock_admin_set }) result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_ids[0]) expect(result).to eq(expected_work_data) end it 'logs appropriate messages for missing values' do # Mock the solr response to simulate a work with missing values, if it somehow makes it past the initial nil check + allow(ActiveFedora::SolrService).to receive(:get).with("id:#{fileset_ids[0]}", rows: 1).and_return('response' => { 'docs' => [] }) allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_ids[0]}", rows: 1).and_return('response' => { 'docs' => [] }) allow(Rails.logger).to receive(:warn) result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_ids[0]) - expect(Rails.logger).to have_received(:warn).with("No work found for fileset id: #{fileset_ids[0]}") + expect(Rails.logger).to have_received(:warn).with("No fileset data found for fileset id: #{fileset_ids[0]}") expect(Rails.logger).to 
have_received(:warn).with("Could not find an admin set, the work with fileset id: #{fileset_ids[0]} has no admin set name.") expect(result[:work_id]).to be_nil expect(result[:work_type]).to be_nil @@ -62,18 +61,10 @@ expect(result[:admin_set_id]).to be_nil end - context 'when no work is found' do - it 'logs a warning if no work is found' do - allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_ids[1]}", rows: 1).and_return('response' => { 'docs' => [] }) - allow(Rails.logger).to receive(:warn) - WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_ids[1]) - expect(Rails.logger).to have_received(:warn).with("No work found for fileset id: #{fileset_ids[1]}") - end - end - context 'when admin set is not found' do it 'logs an appropriate message if the work doesnt have an admin set title' do # Using the mock record without an admin set title + allow(ActiveFedora::SolrService).to receive(:get).with("id:#{fileset_ids[1]}", rows: 1).and_return('response' => { 'docs' => mock_records[1] }) allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_ids[1]}", rows: 1).and_return('response' => { 'docs' => mock_records[1] }) allow(Rails.logger).to receive(:warn) result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_ids[1]) @@ -83,8 +74,9 @@ it 'logs an appropriate message if the query for an admin set returns nothing' do # Using the mock record with an admin set title + allow(ActiveFedora::SolrService).to receive(:get).with("id:#{fileset_ids[1]}", rows: 1).and_return('response' => { 'docs' => mock_records[0] }) allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{fileset_ids[1]}", rows: 1).and_return('response' => { 'docs' => mock_records[0] }) - allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_name}", {'df'=>'title_tesim', :rows=>1}).and_return('response' => { 'docs' => [] }) + allow(ActiveFedora::SolrService).to 
receive(:get).with("title_tesim:#{admin_set_name}", {'df'=>'title_tesim', :rows=>1}).and_return('response' => { 'docs' => [{}] }) allow(Rails.logger).to receive(:warn) result = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_ids[1]) expect(Rails.logger).to have_received(:warn).with("No admin set found with title_tesim: #{admin_set_name}.") diff --git a/spec/services/tasks/download_stats_migration_service_spec.rb b/spec/services/tasks/download_stats_migration_service_spec.rb index a617a2318..294ba1253 100644 --- a/spec/services/tasks/download_stats_migration_service_spec.rb +++ b/spec/services/tasks/download_stats_migration_service_spec.rb @@ -6,6 +6,27 @@ let(:mock_admin_set) { FactoryBot.create(:solr_query_result, :admin_set, title_tesim: [admin_set_title]) } let(:output_path) { Rails.root.join('tmp', 'download_migration_test_output.csv') } let(:service) { described_class.new } + let(:spec_base_analytics_url) { 'https://analytics-qa.lib.unc.edu' } + let(:spec_site_id) { '5' } + let(:spec_auth_token) { 'testtoken' } + let(:matomo_stats_migration_fixture) do + JSON.parse(File.read(File.join(Rails.root, '/spec/fixtures/files/matomo_stats_migration_fixture.json'))) + end + + around do |example| + # Set the environment variables for the test + @auth_token = ENV['MATOMO_AUTH_TOKEN'] + @site_id = ENV['MATOMO_SITE_ID'] + @matomo_base_url = ENV['MATOMO_BASE_URL'] + ENV['MATOMO_AUTH_TOKEN'] = spec_auth_token + ENV['MATOMO_SITE_ID'] = spec_site_id + ENV['MATOMO_BASE_URL'] = spec_base_analytics_url + example.run + # Reset the environment variables + ENV['MATOMO_AUTH_TOKEN'] = @auth_token + ENV['MATOMO_SITE_ID'] = @site_id + ENV['MATOMO_BASE_URL'] = @matomo_base_url + end before do allow(ActiveFedora::SolrService).to receive(:get).with("title_tesim:#{admin_set_title}", { :rows => 1, 'df' => 'title_tesim'}).and_return('response' => { 'docs' => [mock_admin_set] }) @@ -37,47 +58,33 @@ ]] } - # Create a hash of [fileset_id, date.beginning_of_month] => download count for each 
file_download_stats - let(:expected_aggregated_download_count) do - file_download_stats.flatten.each_with_object(Hash.new(0)) do |stat, hash| - hash[[stat.file_id, stat.date.beginning_of_month.to_datetime]] += stat.downloads - end - end - - let(:mock_works) do - file_download_stats.flatten.map do |stat| - FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) - end - end - describe '#list_work_stat_info' do - it 'writes all works to the output CSV file' do - file_download_stats.flatten.each_with_index do |stat, index| - allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) - end + # Loop through each source to test the listing of work stats + [Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO, + Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE].each do |source| + context "when the source is #{source}" do + before do + test_setup_for(source) + end - expected_works = [ - { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '10' }, - { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' }, - { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '50' }, - { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '100' }, - { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '200' }, - { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '300' } - ] - service.list_work_stat_info(output_path, nil) - - expect(File).to exist(output_path) - expect(csv_to_hash_array(output_path)).to match_array(expected_works) - end + it 'writes all works to the output CSV file' do + expected_stats = setup_expected_stats_for(source) + list_work_stat_info_for(source) + expect(File).to exist(output_path) + expect(csv_to_hash_array(output_path)).to match_array(expected_stats) + end - it 'handles and logs errors' do - 
allow(Rails.logger).to receive(:error) - allow(FileDownloadStat).to receive(:all).and_raise(StandardError, 'Simulated database query failure') - service.list_work_stat_info(output_path, nil) - expect(Rails.logger).to have_received(:error).with('An error occurred while listing work stats: Simulated database query failure') + it 'handles and logs errors' do + allow(Rails.logger).to receive(:error) + allow(FileDownloadStat).to receive(:all).and_raise(StandardError, 'Simulated database query failure') + service.list_work_stat_info(output_path, nil, nil, Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE) + expect(Rails.logger).to have_received(:error).with('An error occurred while listing work stats: Simulated database query failure') + end + end end - context 'with an after_timestamp' do + # Excluded from the source loop since it focuses on the after_timestamp parameter + context 'with an after_timestamp (for cache migration only)' do let(:recent_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-05-05 00:00:00 UTC') } let(:old_stats) { FactoryBot.create_list(:file_download_stat, 3, updated_at: '2023-04-05 00:00:00 UTC') } let(:recent_stat_file_ids) { recent_stats.map(&:file_id) } @@ -93,74 +100,97 @@ end end - it 'filters works by the given timestamp' do - # Retrieve works created after 'updated_at' date for old stats - service.list_work_stat_info(output_path, '2023-04-06 00:00:00 UTC') - puts "CSV data: #{csv_to_hash_array(output_path).inspect}" - + it 'filters works by the given after_timestamp' do + service.list_work_stat_info(output_path, '2023-04-06 00:00:00 UTC', nil, Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE) expect(File).to exist(output_path) expect(csv_to_hash_array(output_path).map { |work| work[:file_id] }).to match_array(recent_stat_file_ids) expect(csv_to_hash_array(output_path).map { |work| work[:file_id] }).not_to include(*old_stat_file_ids) end end - end - describe 
'#migrate_to_new_table' do - before do - file_download_stats.flatten.each_with_index do |stat, index| - allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_works[index]] }) + context 'with an unsupported source' do + it 'handles and logs an error' do + allow(Rails.logger).to receive(:error) + service.list_work_stat_info(output_path, nil, nil, :unsupported_source) + expect(Rails.logger).to have_received(:error).with('An error occurred while listing work stats: Unsupported source: unsupported_source') end end + end - after { HycDownloadStat.delete_all } - - it 'creates new HycDownloadStat works from the CSV file' do - service.list_work_stat_info(output_path, nil) - service.migrate_to_new_table(output_path) - csv_to_hash_array(output_path).each_with_index do |csv_row, index| - work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(csv_row[:file_id]) - csv_row_date = Date.parse(csv_row[:date]).beginning_of_month - hyc_download_stat = HycDownloadStat.find_by(fileset_id: csv_row[:file_id], date: csv_row_date) - - expect(hyc_download_stat).to be_present - expect(hyc_download_stat.fileset_id).to eq(csv_row[:file_id]) - expect(hyc_download_stat.work_id).to eq(work_data[:work_id]) - expect(hyc_download_stat.date).to eq(csv_row[:date].to_date) - expect(hyc_download_stat.download_count).to eq(expected_aggregated_download_count[[csv_row[:file_id], csv_row_date]]) - end - end + describe '#migrate_to_new_table' do + # Loop through each source to test the listing of work stats + [Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO, + # WIP: Implement later + # Tasks::DownloadStatsMigrationService::DownloadMigrationSource::GA4, + Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE].each do |source| + context "when the source is #{source}" do + before do + test_setup_for(source) + end + after { HycDownloadStat.delete_all } - it 'retains historic stats for 
a work even if the work cannot be found in solr' do - file_download_stats.flatten.each_with_index do |stat, index| - allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [] }) - end - service.list_work_stat_info(output_path, nil) - service.migrate_to_new_table(output_path) - csv_to_hash_array(output_path).each_with_index do |csv_row, index| - work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(csv_row[:file_id]) - csv_row_date = Date.parse(csv_row[:date]).beginning_of_month - hyc_download_stat = HycDownloadStat.find_by(fileset_id: csv_row[:file_id], date: csv_row_date) - - expect(hyc_download_stat).to be_present - expect(hyc_download_stat.fileset_id).to eq(csv_row[:file_id]) - expect(hyc_download_stat.work_id).to eq('Unknown') - expect(hyc_download_stat.admin_set_id).to eq('Unknown') - expect(hyc_download_stat.work_type).to eq('Unknown') - expect(hyc_download_stat.date).to eq(csv_row[:date].to_date) - expect(hyc_download_stat.download_count).to eq(expected_aggregated_download_count[[csv_row[:file_id], csv_row_date]]) - end - end + let (:expected_stats) { setup_expected_stats_for(source) } + + it 'creates new HycDownloadStat works from the CSV file' do + list_work_stat_info_for(source) + service.migrate_to_new_table(output_path) + csv_to_hash_array(output_path).each_with_index do |csv_row, index| + work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(csv_row[:file_id]) + csv_row_date = Date.parse(csv_row[:date]).beginning_of_month + hyc_download_stat = HycDownloadStat.find_by(fileset_id: csv_row[:file_id], date: csv_row_date) + + expect(hyc_download_stat).to be_present + expect(hyc_download_stat.fileset_id).to eq(csv_row[:file_id]) + expect(hyc_download_stat.work_id).to eq(work_data[:work_id] || 'Unknown') + expect(hyc_download_stat.date).to eq(csv_row[:date].to_date) + + # Verify the download count is correct + expected_work = expected_stats[index] + 
expected_download_count = expected_work[:downloads].to_i + expect(hyc_download_stat.download_count).to eq(expected_download_count) + end + end + + it 'retains historic stats for a work even if the work cannot be found in solr' do + # Mock the solr query to return a mostly empty response for each test file_set_id (1-6) + + (1..6).each do |index| + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:file_id_#{index}", rows: 1).and_return('response' => { 'docs' => [] }) + allow(ActiveFedora::SolrService).to receive(:get).with("id:file_id_#{index}", rows: 1).and_return('response' => { 'docs' => [] }) + end - it 'handles and logs errors' do - allow(CSV).to receive(:read).and_raise(StandardError, 'Simulated CSV read failure') - allow(Rails.logger).to receive(:error) - service.migrate_to_new_table(output_path) - expect(Rails.logger).to have_received(:error).with('An error occurred while migrating work stats: Simulated CSV read failure') + list_work_stat_info_for(source) + service.migrate_to_new_table(output_path) + csv_to_hash_array(output_path).each_with_index do |csv_row, index| + hyc_download_stat = HycDownloadStat.find_by(fileset_id: csv_row[:file_id], date: Date.parse(csv_row[:date]).beginning_of_month) + expect(hyc_download_stat).to be_present + expect(hyc_download_stat.fileset_id).to eq(csv_row[:file_id]) + expect(hyc_download_stat.work_id).to eq('Unknown') + expect(hyc_download_stat.admin_set_id).to eq('Unknown') + expect(hyc_download_stat.work_type).to eq('Unknown') + expect(hyc_download_stat.date).to eq(csv_row[:date].to_date) + + # Verify the download count is correct + expected_work = expected_stats[index] + expected_download_count = expected_work[:downloads].to_i + expect(hyc_download_stat.download_count).to eq(expected_download_count) + end + end + + it 'handles and logs errors' do + allow(CSV).to receive(:read).and_raise(StandardError, 'Simulated CSV read failure') + allow(Rails.logger).to receive(:error) + 
service.migrate_to_new_table(output_path) + expect(Rails.logger).to have_received(:error).with('An error occurred while migrating work stats: Simulated CSV read failure') + end + end end + # Excluding this portion of tests from the source loop as the error handling is the same for all sources context 'if a failure occurs during a private function' do before do - service.list_work_stat_info(output_path, nil) + test_setup_for(Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE) + service.list_work_stat_info(output_path, nil, nil, Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE) end it 'handles and logs errors from create_hyc_download_stat' do @@ -184,19 +214,92 @@ private + def test_setup_for(source) + case source + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO + # Mocking Matomo API responses based on the fixture data + matomo_stats_migration_fixture.each do |month, stats| + stub_request(:get, "#{ENV['MATOMO_BASE_URL']}/index.php") + .with(query: hash_including({ 'date' => month })) + .to_return(status: 200, body: stats.to_json, headers: { 'Content-Type' => 'application/json' }) + end + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::GA4 + else + raise ArgumentError, "Unsupported source: #{source}" + end + stub_solr_query_results_for(source) + end + + def stub_solr_query_results_for(source) + case source + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE + # Use mocked file_download_stats to create works for each file_set_id + mock_works = file_download_stats.flatten.map do |stat| + FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: [stat.file_id]) + end + # Mock query responses for each file_set_id with the corresponding work + file_download_stats.flatten.each_with_index do |stat, index| + mock_work = mock_works[index] + allow(ActiveFedora::SolrService).to 
receive(:get).with("id:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_work['id']] }) + + mock_work_with_admin_set = mock_work.dup + mock_work_with_admin_set['admin_set_tesim'] = [admin_set_title] + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:#{stat.file_id}", rows: 1).and_return('response' => { 'docs' => [mock_work_with_admin_set] }) + end + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO + # Mock query responses for file_set_ids 1-6 + mock_works = (1..6).map do |index| + FactoryBot.create(:solr_query_result, :work, file_set_ids_ssim: ["file_id_#{index}"]) + end + (1..6).each do |index| + allow(ActiveFedora::SolrService).to receive(:get).with("id:file_id_#{index}", rows: 1).and_return('response' => { 'docs' => [mock_works[index - 1]] }) + allow(ActiveFedora::SolrService).to receive(:get).with("file_set_ids_ssim:file_id_#{index}", rows: 1).and_return('response' => { 'docs' => [mock_works[index - 1]] }) + end + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::GA4 + else + raise ArgumentError, "Unsupported source: #{source}" + end + end + + def setup_expected_stats_for(source) + expected_stats = [] + case source + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE + expected_stats = [ + { file_id: 'file_id_1', date: '2023-01-01 00:00:00 UTC', downloads: '10' }, + { file_id: 'file_id_1', date: '2023-03-01 00:00:00 UTC', downloads: '20' }, + { file_id: 'file_id_2', date: '2023-04-01 00:00:00 UTC', downloads: '50' }, + { file_id: 'file_id_2', date: '2023-05-01 00:00:00 UTC', downloads: '100' }, + { file_id: 'file_id_3', date: '2023-06-01 00:00:00 UTC', downloads: '200' }, + { file_id: 'file_id_3', date: '2023-07-01 00:00:00 UTC', downloads: '300' } + ] + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO + expected_stats = [ + { file_id: 'file_id_1', date: '2024-01-01', downloads: '190' }, + { file_id: 'file_id_2', date: 
'2024-01-01', downloads: '150' }, + { file_id: 'file_id_3', date: '2024-02-01', downloads: '100' }, + { file_id: 'file_id_4', date: '2024-02-01', downloads: '80' }, + { file_id: 'file_id_5', date: '2024-03-01', downloads: '180' }, + { file_id: 'file_id_6', date: '2024-03-01', downloads: '550' } + ] + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::GA4 + end + expected_stats + end + def csv_to_hash_array(file_path) CSV.read(file_path, headers: true).map { |row| row.to_h.symbolize_keys } end - # Helper method to convert an array of FileDownloadStat objects to an array of hashes - # Checks for truncated date to the beginning of the month - def expected_works_for(stats) - stats.map do |stat| - { - file_id: stat.file_id, - date: stat.date.beginning_of_month.to_s, - downloads: stat.downloads.to_s, - } + # Execute the list_work_stat_info method for the given source with predefined timestamp parameters + def list_work_stat_info_for(source) + case source + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE + service.list_work_stat_info(output_path, nil, nil, Tasks::DownloadStatsMigrationService::DownloadMigrationSource::CACHE) + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO + service.list_work_stat_info(output_path, '2024-01-01', '2024-03-01', Tasks::DownloadStatsMigrationService::DownloadMigrationSource::MATOMO) + when Tasks::DownloadStatsMigrationService::DownloadMigrationSource::GA4 end end end