-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
HYC-1936 - Hyrax Cache Migration Utility (#1117)
* create rake task and service * truncate by date, update or create stat function * refactor functions in download analytics behavior into a helper module * factory for solr query results, adding functionality to migration service * helper functions for test class, handling truncation outside of query to get around group by errors * aggregate stats during retrieval
- Loading branch information
1 parent
ef05a82
commit bf2925f
Showing
9 changed files
with
535 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# frozen_string_literal: true | ||
module WorkUtilsHelper | ||
def self.fetch_work_data_by_fileset_id(fileset_id) | ||
work = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs'].first || {} | ||
raise "No work found for fileset id: #{fileset_id}" if work.blank? | ||
# Fetch the admin set related to the work | ||
admin_set_name = work['admin_set_tesim']&.first || 'Unknown' | ||
admin_set = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs'].first || {} | ||
|
||
{ | ||
work_id: work['id'] || 'Unknown', | ||
work_type: work.dig('has_model_ssim', 0) || 'Unknown', | ||
title: work['title_tesim']&.first || 'Unknown', | ||
admin_set_id: admin_set['id'] || 'Unknown', | ||
admin_set_name: admin_set_name | ||
} | ||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
# frozen_string_literal: true | ||
module Tasks | ||
class DownloadStatsMigrationService | ||
PAGE_SIZE = 1000 | ||
def list_work_stat_info(output_path, after_timestamp = nil) | ||
begin | ||
query = FileDownloadStat.all | ||
query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present? | ||
total_work_stats = query.count | ||
timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp' | ||
|
||
# Log number of work stats retrieved and timestamp clause | ||
Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.") | ||
|
||
aggregated_data = {} | ||
work_stats_retrieved_from_query_count = 0 | ||
|
||
Rails.logger.info('Retrieving work_stats from the database') | ||
# Fetch the work_stats and aggregate them into monthly stats in Ruby, encountered issues with SQL queries | ||
query.find_each(batch_size: PAGE_SIZE) do |stat| | ||
truncated_date = stat.date.beginning_of_month | ||
# Group the file_id and truncated date to be used as a key | ||
key = [stat.file_id, truncated_date] | ||
# Initialize the hash for the key if it doesn't exist | ||
aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 } | ||
# Sum the downloads for each key | ||
aggregated_data[key][:downloads] += stat.downloads | ||
work_stats_retrieved_from_query_count += 1 | ||
log_progress(work_stats_retrieved_from_query_count, total_work_stats) | ||
end | ||
|
||
aggregated_work_stats = aggregated_data.values | ||
Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats") | ||
|
||
# Write the work_stats to the specified CSV file | ||
write_to_csv(output_path, aggregated_work_stats) | ||
rescue StandardError => e | ||
Rails.logger.error("An error occurred while listing work stats: #{e.message}") | ||
Rails.logger.error(e.backtrace.join("\n")) | ||
end | ||
end | ||
|
||
def migrate_to_new_table(csv_path) | ||
begin | ||
csv_data = CSV.read(csv_path, headers: true) | ||
csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys } | ||
progress_tracker = { | ||
all_categories: 0, | ||
created: 0, | ||
updated: 0, | ||
skipped: 0, | ||
failed: 0 | ||
} | ||
|
||
Rails.logger.info("Migrating #{csv_data_stats.count} work stats to the new table.") | ||
# Recreate or update objects in new table | ||
csv_data_stats.each do |stat| | ||
create_hyc_download_stat(stat, progress_tracker) | ||
progress_tracker[:all_categories] += 1 | ||
log_progress(progress_tracker[:all_categories], csv_data_stats.count, 'Migration') | ||
end | ||
Rails.logger.info("Migration complete: #{progress_tracker[:created]} created, #{progress_tracker[:updated]} updated, #{progress_tracker[:skipped]} skipped, #{progress_tracker[:failed]} failed") | ||
rescue StandardError => e | ||
Rails.logger.error("An error occurred while migrating work stats: #{e.message}") | ||
Rails.logger.error(e.backtrace.join("\n")) | ||
end | ||
end | ||
|
||
private | ||
|
||
# Log progress at 25%, 50%, 75%, and 100% | ||
def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation') | ||
percentages = [0.25, 0.5, 0.75, 1.0] | ||
log_intervals = percentages.map { |percent| (total_work_stats * percent).to_i } | ||
if log_intervals.include?(work_stats_count) | ||
percentage_done = percentages[log_intervals.index(work_stats_count)] * 100 | ||
Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{work_stats_count}/#{total_work_stats} work_stats)") | ||
end | ||
end | ||
|
||
def create_hyc_download_stat(stat, progress_tracker) | ||
begin | ||
hyc_download_stat = HycDownloadStat.find_or_initialize_by( | ||
fileset_id: stat[:file_id].to_s, | ||
date: stat[:date] | ||
) | ||
work_data = work_data_from_stat(stat) | ||
hyc_download_stat.assign_attributes( | ||
fileset_id: stat[:file_id], | ||
work_id: work_data[:work_id], | ||
admin_set_id: work_data[:admin_set_id], | ||
work_type: work_data[:work_type], | ||
date: stat[:date], | ||
download_count: stat[:downloads], | ||
) | ||
rescue StandardError => e | ||
Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{e.message}") | ||
progress_tracker[:failed] += 1 | ||
end | ||
save_hyc_download_stat(hyc_download_stat, stat, progress_tracker) | ||
end | ||
|
||
# Similar implementation to work_data in DownloadAnalyticsBehavior | ||
# Memoization is not necessary here since this method is called per stat | ||
def work_data_from_stat(stat) | ||
WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id]) | ||
end | ||
|
||
# Method to write work stats to a CSV file | ||
def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloads']) | ||
puts "Inspect work_stats: #{work_stats.inspect}" | ||
CSV.open(output_path, 'w', write_headers: true, headers: headers) do |csv| | ||
work_stats.each do |stat| | ||
csv << [stat[:file_id], stat[:date], stat[:downloads]] | ||
end | ||
end | ||
Rails.logger.info("Work stats successfully written to #{output_path}") | ||
end | ||
|
||
# Method to save the HycDownloadStat object and update the progress tracker | ||
def save_hyc_download_stat(hyc_download_stat, stat, progress_tracker) | ||
begin | ||
if hyc_download_stat.new_record? | ||
hyc_download_stat.save | ||
progress_tracker[:created] += 1 | ||
elsif hyc_download_stat.changed? | ||
hyc_download_stat.save | ||
progress_tracker[:updated] += 1 | ||
else | ||
progress_tracker[:skipped] += 1 | ||
end | ||
rescue StandardError => e | ||
Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{e.message}") | ||
progress_tracker[:failed] += 1 | ||
end | ||
end | ||
|
||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# frozen_string_literal: true | ||
require 'time' | ||
require 'optparse' | ||
require 'optparse/date' | ||
|
||
namespace :migrate_download_stats do | ||
desc 'output rows for download stat migration into a csv' | ||
task :list_rows, [:output_dir, :after] => :environment do |_t, _args| | ||
start_time = Time.now | ||
puts "[#{start_time.utc.iso8601}] starting listing of work data" | ||
options = {} | ||
|
||
opts = OptionParser.new | ||
opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]' | ||
opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val } | ||
opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val } | ||
args = opts.order!(ARGV) {} | ||
opts.parse!(args) | ||
|
||
unless options[:output_dir].present? && options[:output_dir].end_with?('.csv') | ||
puts 'Please provide a valid output directory with a .csv extension. Got ' + options[:output_dir].to_s | ||
exit 1 | ||
end | ||
|
||
migration_service = Tasks::DownloadStatsMigrationService.new | ||
old_stats_csv = migration_service.list_work_stat_info(options[:output_dir], options[:after]) | ||
puts "Listing completed in #{Time.now - start_time}s" | ||
puts "Stored id list to file: #{options[:output_dir]}" | ||
exit 0 | ||
end | ||
|
||
desc 'migrate download stats to new table' | ||
task :migrate, [:csv_path] => :environment do |_t, _args| | ||
start_time = Time.now | ||
puts "[#{start_time.utc.iso8601}] Starting migration from CSV to new table" | ||
options = {} | ||
|
||
opts = OptionParser.new | ||
opts.banner = 'Usage: bundle exec rake migrate_download_stats:migrate -- [options]' | ||
opts.on('-c', '--csv-path ARG', String, 'Path to the CSV file to migrate') { |val| options[:csv_path] = val } | ||
args = opts.order!(ARGV) {} | ||
opts.parse!(args) | ||
|
||
unless options[:csv_path].present? && File.exist?(options[:csv_path]) | ||
puts 'Please provide a valid CSV file path' | ||
exit 1 | ||
end | ||
|
||
migration_service = Tasks::DownloadStatsMigrationService.new | ||
migration_service.migrate_to_new_table(options[:csv_path]) | ||
puts "Migration completed in #{Time.now - start_time}s" | ||
exit 0 | ||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.