Skip to content

Commit

Permalink
HYC-1936 - Hyrax Cache Migration Utility (#1117)
Browse files Browse the repository at this point in the history
* create rake task and service

* truncate by date, update or create stat function

* refactor functions in download analytics behavior into a helper module

* factory for solr query results, adding functionality to migration service

* helper functions for test class, handling truncation outside of query to get around group by errors

* aggregate stats during retrieval
  • Loading branch information
davidcam-src authored Aug 29, 2024
1 parent ef05a82 commit bf2925f
Show file tree
Hide file tree
Showing 9 changed files with 535 additions and 91 deletions.
43 changes: 8 additions & 35 deletions app/controllers/concerns/hyc/download_analytics_behavior.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def track_download
send_image: '0',
ua: user_agent,
# Recovering work id with a solr query
dimension1: record_id,
dimension2: record_title
dimension1: work_data[:work_id],
dimension2: work_data[:title]
}
uri.query = URI.encode_www_form(uri_params)
response = HTTParty.get(uri.to_s)
Expand All @@ -58,19 +58,16 @@ def track_download
end

def create_download_stat
record_id_value = record_id
work_type_value = work_type
admin_set_id_value = admin_set_id
date = Date.today

Rails.logger.debug('Creating or updating hyc-download-stat database entry with the following attributes:')
Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{record_id_value}, admin_set_id: #{admin_set_id_value}, work_type: #{work_type_value}, date: #{date.beginning_of_month}")
Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{work_data[:work_id]}, admin_set_id: #{work_data[:admin_set_id]}, work_type: #{work_data[:work_type]}, date: #{date.beginning_of_month}")

stat = HycDownloadStat.find_or_initialize_by(
fileset_id: fileset_id,
work_id: record_id_value,
admin_set_id: admin_set_id_value,
work_type: work_type_value,
work_id: work_data[:work_id],
admin_set_id: work_data[:admin_set_id],
work_type: work_data[:work_type],
date: date.beginning_of_month
)
stat.download_count += 1
Expand All @@ -87,36 +84,12 @@ def bot_request?(user_agent)
browser.bot?
end

def fetch_record
@record ||= ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs']
end

def fetch_admin_set
@admin_set ||= ActiveFedora::SolrService.get("title_tesim:#{@admin_set_name}", rows: 1)['response']['docs']
end

def admin_set_id
@admin_set_id ||= fetch_admin_set.dig(0, 'id') || 'Unknown'
end

def record_id
@record_id ||= fetch_record.dig(0, 'id') || 'Unknown'
end

def work_type
@work_type ||= fetch_record.dig(0, 'has_model_ssim', 0) || 'Unknown'
end

# Id of the file set being downloaded, read from the request params.
# Falls back to the literal string 'Unknown' when params[:id] is nil/false.
# The value is memoized in @fileset_id.
def fileset_id
  return @fileset_id if @fileset_id

  @fileset_id = params[:id] || 'Unknown'
end

def record_title
@record_title ||= if !fetch_record.blank? && fetch_record[0]['title_tesim']
fetch_record[0]['title_tesim'].first
else
'Unknown'
end
# Work metadata for the current file set, as returned by
# WorkUtilsHelper.fetch_work_data_by_fileset_id. Memoized in @work_data so the
# helper is only invoked once per receiver.
def work_data
  return @work_data if @work_data

  @work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
end

def site_id
Expand Down
18 changes: 18 additions & 0 deletions app/helpers/work_utils_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true
# Solr lookups shared by the download analytics code paths.
module WorkUtilsHelper
  class << self
    # Looks up the work that contains the given file set, plus the admin set
    # named in that work's 'admin_set_tesim' field.
    #
    # @param fileset_id [String] id matched against the 'file_set_ids_ssim' field
    # @return [Hash] with keys :work_id, :work_type, :title, :admin_set_id and
    #   :admin_set_name; any value missing from the Solr documents is 'Unknown'
    # @raise [RuntimeError] when no work document is found for the file set
    def fetch_work_data_by_fileset_id(fileset_id)
      work_response = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)
      work = work_response['response']['docs'].first || {}
      raise "No work found for fileset id: #{fileset_id}" if work.blank?

      # The admin set is resolved by title, using the first name stored on the work
      admin_set_name = work['admin_set_tesim']&.first || 'Unknown'
      admin_set_response = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)
      admin_set = admin_set_response['response']['docs'].first || {}

      work_id = work['id'] || 'Unknown'
      work_type = work.dig('has_model_ssim', 0) || 'Unknown'
      title = work['title_tesim']&.first || 'Unknown'
      admin_set_id = admin_set['id'] || 'Unknown'

      {
        work_id: work_id,
        work_type: work_type,
        title: title,
        admin_set_id: admin_set_id,
        admin_set_name: admin_set_name
      }
    end
  end
end
139 changes: 139 additions & 0 deletions app/services/tasks/download_stats_migration_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# frozen_string_literal: true
module Tasks
  # Two-step utility for copying download statistics from FileDownloadStat rows
  # (the hyrax local cache) into HycDownloadStat rows:
  #   1. list_work_stat_info aggregates the cached stats per file and month and
  #      writes them to a CSV file.
  #   2. migrate_to_new_table reads such a CSV and creates or updates one
  #      HycDownloadStat per row.
  # Both entry points log StandardError failures instead of raising them.
  class DownloadStatsMigrationService
    # Batch size used when iterating over FileDownloadStat rows.
    PAGE_SIZE = 1000

    # Aggregates FileDownloadStat rows into one entry per (file_id, month) and
    # writes the result to a CSV file.
    #
    # @param output_path [String] path of the CSV file to write
    # @param after_timestamp [String, Time, nil] when present, only rows whose
    #   updated_at is later than this value are included
    # @return [Object] the result of the final logging call; errors are logged, not raised
    def list_work_stat_info(output_path, after_timestamp = nil)
      query = FileDownloadStat.all
      query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
      total_work_stats = query.count
      timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'

      # Log number of work stats retrieved and timestamp clause
      Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")

      aggregated_data = {}
      work_stats_retrieved_from_query_count = 0

      Rails.logger.info('Retrieving work_stats from the database')
      # Monthly aggregation is done in Ruby rather than SQL (the SQL approach ran into issues)
      query.find_each(batch_size: PAGE_SIZE) do |stat|
        truncated_date = stat.date.beginning_of_month
        # file_id plus the first day of the month identifies one aggregated row
        key = [stat.file_id, truncated_date]
        aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 }
        aggregated_data[key][:downloads] += stat.downloads
        work_stats_retrieved_from_query_count += 1
        log_progress(work_stats_retrieved_from_query_count, total_work_stats)
      end

      aggregated_work_stats = aggregated_data.values
      Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats")

      write_to_csv(output_path, aggregated_work_stats)
    rescue StandardError => e
      Rails.logger.error("An error occurred while listing work stats: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n"))
    end

    # Reads a CSV with file_id, date and downloads columns and creates or
    # updates a HycDownloadStat for every row. Rows that cannot be built or
    # saved are logged and counted as failed; they do not abort the run.
    #
    # @param csv_path [String] path of the CSV file to read
    # @return [Object] the result of the final logging call; errors are logged, not raised
    def migrate_to_new_table(csv_path)
      csv_data = CSV.read(csv_path, headers: true)
      csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys }
      total_stats = csv_data_stats.count
      progress_tracker = {
        all_categories: 0,
        created: 0,
        updated: 0,
        skipped: 0,
        failed: 0
      }

      Rails.logger.info("Migrating #{total_stats} work stats to the new table.")
      # Recreate or update objects in new table
      csv_data_stats.each do |stat|
        create_hyc_download_stat(stat, progress_tracker)
        progress_tracker[:all_categories] += 1
        log_progress(progress_tracker[:all_categories], total_stats, 'Migration')
      end
      Rails.logger.info("Migration complete: #{progress_tracker[:created]} created, #{progress_tracker[:updated]} updated, #{progress_tracker[:skipped]} skipped, #{progress_tracker[:failed]} failed")
    rescue StandardError => e
      Rails.logger.error("An error occurred while migrating work stats: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n"))
    end

    private

    # Log progress at 25%, 50%, 75%, and 100%
    #
    # @param work_stats_count [Integer] number of items processed so far
    # @param total_work_stats [Integer] total number of items to process
    # @param process_type [String] label used at the start of the log line
    def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation')
      percentages = [0.25, 0.5, 0.75, 1.0]
      log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i }
      interval_index = log_intervals.index(work_stats_count)
      return if interval_index.nil?

      percentage_done = percentages[interval_index] * 100
      Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{work_stats_count}/#{total_work_stats} work_stats)")
    end

    # Builds (or loads) the HycDownloadStat for one CSV row and hands it to
    # save_hyc_download_stat. When building fails, the row is counted as failed
    # exactly once and nothing is saved, so a half-populated record is never persisted.
    #
    # @param stat [Hash] row with :file_id, :date and :downloads
    # @param progress_tracker [Hash] counters updated in place
    def create_hyc_download_stat(stat, progress_tracker)
      begin
        hyc_download_stat = HycDownloadStat.find_or_initialize_by(
          fileset_id: stat[:file_id].to_s,
          date: stat[:date]
        )
        work_data = work_data_from_stat(stat)
        hyc_download_stat.assign_attributes(
          fileset_id: stat[:file_id],
          work_id: work_data[:work_id],
          admin_set_id: work_data[:admin_set_id],
          work_type: work_data[:work_type],
          date: stat[:date],
          download_count: stat[:downloads]
        )
      rescue StandardError => e
        Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{e.message}")
        progress_tracker[:failed] += 1
        # Stop here: the record is missing or incomplete and must not be saved
        return
      end
      save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
    end

    # Similar implementation to work_data in DownloadAnalyticsBehavior
    # Memoization is not necessary here since this method is called per stat
    def work_data_from_stat(stat)
      WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id])
    end

    # Writes the aggregated stats to a CSV file, one row per stat, preceded by a header row.
    #
    # @param output_path [String] path of the CSV file to write
    # @param work_stats [Array<Hash>] entries with :file_id, :date and :downloads
    # @param headers [Array<String>] header row written at the top of the file
    def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloads'])
      CSV.open(output_path, 'w', write_headers: true, headers: headers) do |csv|
        work_stats.each do |stat|
          csv << [stat[:file_id], stat[:date], stat[:downloads]]
        end
      end
      Rails.logger.info("Work stats successfully written to #{output_path}")
    end

    # Saves the HycDownloadStat object and updates the progress tracker.
    # New records count as created, changed records as updated and unchanged
    # records as skipped. A save that returns false or raises counts as failed.
    #
    # @param hyc_download_stat [HycDownloadStat] record to persist
    # @param stat [Hash] source row, used only in log messages
    # @param progress_tracker [Hash] counters updated in place
    def save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
      # The category must be decided before saving, because saving changes new_record?/changed?
      category = if hyc_download_stat.new_record?
                   :created
                 elsif hyc_download_stat.changed?
                   :updated
                 else
                   :skipped
                 end

      if category != :skipped && !hyc_download_stat.save
        Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}")
        category = :failed
      end
      progress_tracker[category] += 1
    rescue StandardError => e
      Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{e.message}")
      progress_tracker[:failed] += 1
    end
  end
end
54 changes: 54 additions & 0 deletions lib/tasks/migrate_download_stats.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# frozen_string_literal: true
require 'time'
require 'optparse'
require 'optparse/date'

# Rake entry points for the download stats migration. Options are read from
# ARGV with OptionParser (pass them after `--`, as shown in each banner).
namespace :migrate_download_stats do
  desc 'output rows for download stat migration into a csv'
  task :list_rows, [:output_dir, :after] => :environment do |_task, _args|
    started_at = Time.now
    puts "[#{started_at.utc.iso8601}] starting listing of work data"
    options = {}

    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]'
      opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val }
      opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val }
    end
    remaining_args = parser.order!(ARGV) {}
    parser.parse!(remaining_args)

    # Despite its name, --output-dir must be the path of a .csv file
    output_file = options[:output_dir]
    if !output_file.present? || !output_file.end_with?('.csv')
      puts "Please provide a valid output directory with a .csv extension. Got #{output_file}"
      exit 1
    end

    Tasks::DownloadStatsMigrationService.new.list_work_stat_info(output_file, options[:after])
    puts "Listing completed in #{Time.now - started_at}s"
    puts "Stored id list to file: #{output_file}"
    exit 0
  end

  desc 'migrate download stats to new table'
  task :migrate, [:csv_path] => :environment do |_task, _args|
    started_at = Time.now
    puts "[#{started_at.utc.iso8601}] Starting migration from CSV to new table"
    options = {}

    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: bundle exec rake migrate_download_stats:migrate -- [options]'
      opts.on('-c', '--csv-path ARG', String, 'Path to the CSV file to migrate') { |val| options[:csv_path] = val }
    end
    remaining_args = parser.order!(ARGV) {}
    parser.parse!(remaining_args)

    # The CSV must already exist, e.g. as written by the list_rows task
    csv_file = options[:csv_path]
    if !csv_file.present? || !File.exist?(csv_file)
      puts 'Please provide a valid CSV file path'
      exit 1
    end

    Tasks::DownloadStatsMigrationService.new.migrate_to_new_table(csv_file)
    puts "Migration completed in #{Time.now - started_at}s"
    exit 0
  end
end
73 changes: 17 additions & 56 deletions spec/controllers/hyrax/downloads_controller_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@
'admin_set_tesim' => ['Open_Access_Articles_and_Book_Chapters']}
]
}
let(:mock_work_data) { {
work_id: '1z40m031g',
work_type: 'Article',
title: ['Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response'],
admin_set_id: 'h128zk07m',
admin_set_name: 'Open_Access_Articles_and_Book_Chapters'
}
}
let(:file_set) do
FactoryBot.create(:file_with_work, user: user, content: File.open("#{fixture_path}/files/image.png"))
end
Expand All @@ -53,8 +61,7 @@
allow(stub_matomo)
@user = user
sign_in @user
allow(controller).to receive(:fetch_record).and_return(mock_record)
allow(controller).to receive(:fetch_admin_set).and_return(mock_admin_set)
allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(mock_work_data)
allow(Hyrax::Analytics.config).to receive(:site_id).and_return(spec_site_id)
allow(SecureRandom).to receive(:uuid).and_return('555')
allow(Hyrax::VirusCheckerService).to receive(:file_has_virus?) { false }
Expand Down Expand Up @@ -201,8 +208,14 @@

context 'fileset without a parent work' do
before do
allow(controller).to receive(:fetch_record).and_return([{}])
allow(controller).to receive(:fetch_admin_set).and_return([{}])
dummy_work_data = {
work_id: 'Unknown',
work_type: 'Unknown',
title: 'Unknown',
admin_set_id: 'Unknown',
admin_set_name: 'Unknown'
}
allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(dummy_work_data)
end

it 'records a download event with no work type' do
Expand Down Expand Up @@ -332,58 +345,6 @@
end
end

describe '#fetch_record' do
it 'fetches the record from Solr' do
expect(controller.send(:fetch_record)).to eq(mock_record)
end
end

describe '#fetch_admin_set' do
it 'fetches the admin set from Solr' do
expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set)
end
end

describe '#admin_set_id' do
it 'returns the admin set id' do
expect(controller.send(:admin_set_id)).to eq('h128zk07m')
end
end

describe '#record_id' do
it 'returns the record id' do
expect(controller.send(:record_id)).to eq('1z40m031g')
end

it 'returns Unknown if the record is blank' do
allow(controller).to receive(:fetch_record).and_return([])
expect(controller.send(:record_id)).to eq('Unknown')
end
end

describe '#fileset_id' do
it 'returns the fileset id from params' do
controller.params = { id: file_set.id }
expect(controller.send(:fileset_id)).to eq(file_set.id)
end

it 'returns Unknown if params id is missing' do
controller.params = {}
expect(controller.send(:fileset_id)).to eq('Unknown')
end
end

describe '#record_title' do
it 'returns the record title' do
expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response')
end

it 'returns Unknown if the record title is blank' do
allow(controller).to receive(:fetch_record).and_return([{ 'title_tesim' => nil }])
expect(controller.send(:record_title)).to eq('Unknown')
end
end

describe '#site_id' do
it 'returns the site id from ENV' do
expect(controller.send(:site_id)).to eq('5')
Expand Down
Loading

0 comments on commit bf2925f

Please sign in to comment.