Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HYC-1936 - Hyrax Cache Migration Utility #1117

Merged
merged 32 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c2a2dce
create rake task and service
davidcam-src Aug 19, 2024
fc8dbe3
tests for listing ids in the download stats migration service
davidcam-src Aug 19, 2024
e7a3562
update date range in factory
davidcam-src Aug 19, 2024
33bea2c
expand function to retrieve fields besides id, test refactor, enforce…
davidcam-src Aug 21, 2024
6a1b8ed
truncate by date, update or create stat function
davidcam-src Aug 21, 2024
fd91dfe
addressing grouping errors
davidcam-src Aug 21, 2024
143345c
check for truncated date in tests
davidcam-src Aug 21, 2024
cd78526
refactor functions in download analytics behavior into a helper module
davidcam-src Aug 22, 2024
345b910
linting, tests for utility helper and removing comments
davidcam-src Aug 22, 2024
7d5572d
factory for solr query results, adding functionality to migration ser…
davidcam-src Aug 23, 2024
b891812
helper functions for test class, handling truncation outside of query…
davidcam-src Aug 23, 2024
4f62fb6
cleaning up test file
davidcam-src Aug 23, 2024
14325ab
add task to rake file
davidcam-src Aug 23, 2024
22f809b
rubocop
davidcam-src Aug 23, 2024
b738beb
error message change
davidcam-src Aug 23, 2024
4f995e4
change csv check condition
davidcam-src Aug 23, 2024
71cf417
name change
davidcam-src Aug 23, 2024
862a793
logging and wording change
davidcam-src Aug 26, 2024
d335b72
rubocop
davidcam-src Aug 26, 2024
aa9bd55
phrasing change and logging
davidcam-src Aug 26, 2024
a914e06
slight logging change and moving csv writing into another function
davidcam-src Aug 26, 2024
59145a5
rubocop
davidcam-src Aug 26, 2024
df9279d
changed arguments for log progress function
davidcam-src Aug 26, 2024
27d54b7
progress tracking in migration service
davidcam-src Aug 26, 2024
50058c4
error handling
davidcam-src Aug 26, 2024
e9ea6dc
increment all_categories in progress tracker
davidcam-src Aug 26, 2024
115df42
tests
davidcam-src Aug 27, 2024
177dc77
remove comments
davidcam-src Aug 27, 2024
c2d339c
update test to avoid checking for the same value
davidcam-src Aug 28, 2024
c7b2b60
aggregate stats during retrieval
davidcam-src Aug 28, 2024
184e896
tests
davidcam-src Aug 28, 2024
66229d7
tests
davidcam-src Aug 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 8 additions & 35 deletions app/controllers/concerns/hyc/download_analytics_behavior.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def track_download
send_image: '0',
ua: user_agent,
# Recovering work id with a solr query
dimension1: record_id,
dimension2: record_title
dimension1: work_data[:work_id],
dimension2: work_data[:title]
}
uri.query = URI.encode_www_form(uri_params)
response = HTTParty.get(uri.to_s)
Expand All @@ -58,19 +58,16 @@ def track_download
end

def create_download_stat
record_id_value = record_id
work_type_value = work_type
admin_set_id_value = admin_set_id
date = Date.today

Rails.logger.debug('Creating or updating hyc-download-stat database entry with the following attributes:')
Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{record_id_value}, admin_set_id: #{admin_set_id_value}, work_type: #{work_type_value}, date: #{date.beginning_of_month}")
Rails.logger.debug("fileset_id: #{fileset_id}, work_id: #{work_data[:work_id]}, admin_set_id: #{work_data[:admin_set_id]}, work_type: #{work_data[:work_type]}, date: #{date.beginning_of_month}")

stat = HycDownloadStat.find_or_initialize_by(
fileset_id: fileset_id,
work_id: record_id_value,
admin_set_id: admin_set_id_value,
work_type: work_type_value,
work_id: work_data[:work_id],
admin_set_id: work_data[:admin_set_id],
work_type: work_data[:work_type],
date: date.beginning_of_month
)
stat.download_count += 1
Expand All @@ -87,36 +84,12 @@ def bot_request?(user_agent)
browser.bot?
end

def fetch_record
@record ||= ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs']
end

def fetch_admin_set
@admin_set ||= ActiveFedora::SolrService.get("title_tesim:#{@admin_set_name}", rows: 1)['response']['docs']
end

def admin_set_id
@admin_set_id ||= fetch_admin_set.dig(0, 'id') || 'Unknown'
end

def record_id
@record_id ||= fetch_record.dig(0, 'id') || 'Unknown'
end

def work_type
@work_type ||= fetch_record.dig(0, 'has_model_ssim', 0) || 'Unknown'
end

# Id of the fileset being downloaded, taken from the request params.
# Memoized; falls back to 'Unknown' when the request carries no :id.
def fileset_id
  return @fileset_id if @fileset_id

  requested_id = params[:id]
  @fileset_id = requested_id || 'Unknown'
end

def record_title
@record_title ||= if !fetch_record.blank? && fetch_record[0]['title_tesim']
fetch_record[0]['title_tesim'].first
else
'Unknown'
end
# Work metadata for the fileset being downloaded, as returned by
# WorkUtilsHelper.fetch_work_data_by_fileset_id. Memoized so the lookup
# runs at most once per controller instance.
def work_data
  return @work_data if @work_data

  @work_data = WorkUtilsHelper.fetch_work_data_by_fileset_id(fileset_id)
end

def site_id
Expand Down
18 changes: 18 additions & 0 deletions app/helpers/work_utils_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true
# Solr lookups for work metadata, shared by download tracking and the
# download stats migration.
module WorkUtilsHelper
  # Looks up the work that owns a fileset, plus the admin set named on that work.
  #
  # @param fileset_id [String] id matched against the file_set_ids_ssim Solr field
  # @return [Hash] with keys :work_id, :work_type, :title, :admin_set_id and
  #   :admin_set_name; any value missing from the Solr documents is 'Unknown'
  # @raise [RuntimeError] when no Solr document references the fileset
  def self.fetch_work_data_by_fileset_id(fileset_id)
    work_docs = ActiveFedora::SolrService.get("file_set_ids_ssim:#{fileset_id}", rows: 1)['response']['docs']
    work = work_docs.first || {}
    raise "No work found for fileset id: #{fileset_id}" if work.blank?

    # The work document only carries the admin set's name, so its id is
    # resolved with a second query on the title field.
    admin_set_name = work['admin_set_tesim']&.first || 'Unknown'
    admin_set_docs = ActiveFedora::SolrService.get("title_tesim:#{admin_set_name}", rows: 1)['response']['docs']
    admin_set = admin_set_docs.first || {}

    work_id = work['id'] || 'Unknown'
    work_type = work.dig('has_model_ssim', 0) || 'Unknown'
    title = work['title_tesim']&.first || 'Unknown'
    admin_set_id = admin_set['id'] || 'Unknown'

    {
      work_id: work_id,
      work_type: work_type,
      title: title,
      admin_set_id: admin_set_id,
      admin_set_name: admin_set_name
    }
  end
end
139 changes: 139 additions & 0 deletions app/services/tasks/download_stats_migration_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# frozen_string_literal: true
module Tasks
  # Two-step migration of download statistics out of the Hyrax local cache.
  #
  # 1. list_work_stat_info aggregates FileDownloadStat rows into per-file
  #    monthly totals and writes them to a CSV file.
  # 2. migrate_to_new_table reads that CSV and creates or updates the
  #    matching HycDownloadStat rows.
  #
  # Both entry points log errors instead of raising them, so callers cannot
  # detect a failure from the return value.
  class DownloadStatsMigrationService
    # Batch size used when iterating over FileDownloadStat rows
    PAGE_SIZE = 1000
    # Fractions of the total at which progress is logged
    PROGRESS_FRACTIONS = [0.25, 0.5, 0.75, 1.0].freeze

    # Aggregates cached download stats by file and month and writes them to a CSV.
    #
    # @param output_path [String] path of the CSV file to write
    # @param after_timestamp [String, nil] when present, only rows with an
    #   updated_at later than this value are listed
    def list_work_stat_info(output_path, after_timestamp = nil)
      query = FileDownloadStat.all
      query = query.where('updated_at > ?', after_timestamp) if after_timestamp.present?
      total_work_stats = query.count
      timestamp_clause = after_timestamp.present? ? "after specified time #{after_timestamp}" : 'without a timestamp'

      # Log number of work stats retrieved and timestamp clause
      Rails.logger.info("Listing #{total_work_stats} work stats #{timestamp_clause} to #{output_path} from the hyrax local cache.")

      Rails.logger.info('Retrieving work_stats from the database')
      aggregated_work_stats = aggregate_monthly_stats(query, total_work_stats)
      Rails.logger.info("Aggregated #{aggregated_work_stats.count} monthly stats from #{total_work_stats} daily stats")

      # Write the work_stats to the specified CSV file
      write_to_csv(output_path, aggregated_work_stats)
    rescue StandardError => e
      Rails.logger.error("An error occurred while listing work stats: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n"))
    end

    # Creates or updates one HycDownloadStat row per line of the given CSV.
    # A row that fails is logged and counted; it does not abort the run.
    #
    # @param csv_path [String] path to a CSV with file_id, date and downloads columns
    def migrate_to_new_table(csv_path)
      csv_data = CSV.read(csv_path, headers: true)
      csv_data_stats = csv_data.map { |row| row.to_h.symbolize_keys }
      total_stats = csv_data_stats.count
      progress_tracker = {
        all_categories: 0,
        created: 0,
        updated: 0,
        skipped: 0,
        failed: 0
      }

      Rails.logger.info("Migrating #{total_stats} work stats to the new table.")
      # Recreate or update objects in new table
      csv_data_stats.each do |stat|
        create_hyc_download_stat(stat, progress_tracker)
        progress_tracker[:all_categories] += 1
        log_progress(progress_tracker[:all_categories], total_stats, 'Migration')
      end
      Rails.logger.info("Migration complete: #{progress_tracker[:created]} created, #{progress_tracker[:updated]} updated, #{progress_tracker[:skipped]} skipped, #{progress_tracker[:failed]} failed")
    rescue StandardError => e
      Rails.logger.error("An error occurred while migrating work stats: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n"))
    end

    private

    # Sums downloads per [file_id, first day of month]. Done in Ruby rather
    # than SQL because the SQL grouping queries ran into issues.
    #
    # @return [Array<Hash>] one hash per key with :file_id, :date and :downloads
    def aggregate_monthly_stats(query, total_work_stats)
      aggregated_data = {}
      retrieved_count = 0

      query.find_each(batch_size: PAGE_SIZE) do |stat|
        truncated_date = stat.date.beginning_of_month
        # Group the file_id and truncated date to be used as a key
        key = [stat.file_id, truncated_date]
        aggregated_data[key] ||= { file_id: stat.file_id, date: truncated_date, downloads: 0 }
        aggregated_data[key][:downloads] += stat.downloads
        retrieved_count += 1
        log_progress(retrieved_count, total_work_stats)
      end

      aggregated_data.values
    end

    # Log progress at 25%, 50%, 75%, and 100%
    def log_progress(work_stats_count, total_work_stats, process_type = 'Retrieval and Aggregation')
      # For small totals several fractions truncate to the same count; the first one wins
      fraction = PROGRESS_FRACTIONS.find { |percent| (total_work_stats * percent).to_i == work_stats_count }
      return unless fraction

      percentage_done = fraction * 100
      Rails.logger.info("#{process_type} progress: #{percentage_done}% (#{work_stats_count}/#{total_work_stats} work_stats)")
    end

    # Builds (or loads) the HycDownloadStat for one CSV row and saves it.
    # A row whose record cannot be built is counted as failed exactly once
    # and is never passed on to be saved.
    def create_hyc_download_stat(stat, progress_tracker)
      hyc_download_stat = HycDownloadStat.find_or_initialize_by(
        fileset_id: stat[:file_id].to_s,
        date: stat[:date]
      )
      work_data = work_data_from_stat(stat)
      hyc_download_stat.assign_attributes(
        fileset_id: stat[:file_id],
        work_id: work_data[:work_id],
        admin_set_id: work_data[:admin_set_id],
        work_type: work_data[:work_type],
        date: stat[:date],
        download_count: stat[:downloads]
      )
    rescue StandardError => e
      Rails.logger.error("Failed to create HycDownloadStat for #{stat.inspect}: #{e.message}")
      progress_tracker[:failed] += 1
    else
      # Only reached when the record was built without error
      save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
    end

    # Similar implementation to work_data in DownloadAnalyticsBehavior
    # Memoization is not necessary here since this method is called per stat
    def work_data_from_stat(stat)
      WorkUtilsHelper.fetch_work_data_by_fileset_id(stat[:file_id])
    end

    # Writes the aggregated stats to a CSV file.
    #
    # @param work_stats [Array<Hash>] hashes with :file_id, :date and :downloads
    # @param headers [Array<String>] header row; the data columns are always
    #   file_id, date, downloads in that order
    def write_to_csv(output_path, work_stats, headers = ['file_id', 'date', 'downloads'])
      CSV.open(output_path, 'w', write_headers: true, headers: headers) do |csv|
        work_stats.each do |stat|
          csv << [stat[:file_id], stat[:date], stat[:downloads]]
        end
      end
      Rails.logger.info("Work stats successfully written to #{output_path}")
    end

    # Saves the HycDownloadStat object and updates the progress tracker.
    # Unchanged persisted records are skipped; a save that returns false or
    # raises is counted as failed rather than as created/updated.
    def save_hyc_download_stat(hyc_download_stat, stat, progress_tracker)
      outcome = if hyc_download_stat.new_record?
                  :created
                elsif hyc_download_stat.changed?
                  :updated
                else
                  :skipped
                end

      if outcome != :skipped && !hyc_download_stat.save
        Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{hyc_download_stat.errors.full_messages.join(', ')}")
        outcome = :failed
      end
      progress_tracker[outcome] += 1
    rescue StandardError => e
      Rails.logger.error("Error saving new row to HycDownloadStat: #{stat.inspect}: #{e.message}")
      progress_tracker[:failed] += 1
    end
  end
end
54 changes: 54 additions & 0 deletions lib/tasks/migrate_download_stats.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# frozen_string_literal: true
require 'time'
require 'optparse'
require 'optparse/date'

# Rake entry points for the download stats migration. Options are passed
# after `--`, e.g.
#   bundle exec rake migrate_download_stats:list_rows -- -o stats.csv
namespace :migrate_download_stats do
  desc 'output rows for download stat migration into a csv'
  task :list_rows, [:output_dir, :after] => :environment do |_t, _args|
    start_time = Time.now
    puts "[#{start_time.utc.iso8601}] starting listing of work data"

    options = {}
    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: bundle exec rake migrate_download_stats:list_rows -- [options]'
      opts.on('-o', '--output-dir ARG', String, 'Directory list will be saved to') { |val| options[:output_dir] = val }
      opts.on('-a', '--after ARG', String, 'List objects which have been updated after this timestamp') { |val| options[:after] = val }
    end
    # Non-option arguments left in ARGV are yielded to the empty block, i.e. ignored
    remaining_args = parser.order!(ARGV) {}
    parser.parse!(remaining_args)

    # Despite the option name, the value must be the path of a .csv file
    output_path = options[:output_dir]
    output_path_valid = output_path.present? && output_path.end_with?('.csv')
    unless output_path_valid
      puts "Please provide a valid output directory with a .csv extension. Got #{output_path}"
      exit 1
    end

    Tasks::DownloadStatsMigrationService.new.list_work_stat_info(output_path, options[:after])
    puts "Listing completed in #{Time.now - start_time}s"
    puts "Stored id list to file: #{output_path}"
    exit 0
  end

  desc 'migrate download stats to new table'
  task :migrate, [:csv_path] => :environment do |_t, _args|
    start_time = Time.now
    puts "[#{start_time.utc.iso8601}] Starting migration from CSV to new table"

    options = {}
    parser = OptionParser.new do |opts|
      opts.banner = 'Usage: bundle exec rake migrate_download_stats:migrate -- [options]'
      opts.on('-c', '--csv-path ARG', String, 'Path to the CSV file to migrate') { |val| options[:csv_path] = val }
    end
    # Non-option arguments left in ARGV are yielded to the empty block, i.e. ignored
    remaining_args = parser.order!(ARGV) {}
    parser.parse!(remaining_args)

    csv_path = options[:csv_path]
    csv_path_valid = csv_path.present? && File.exist?(csv_path)
    unless csv_path_valid
      puts 'Please provide a valid CSV file path'
      exit 1
    end

    Tasks::DownloadStatsMigrationService.new.migrate_to_new_table(csv_path)
    puts "Migration completed in #{Time.now - start_time}s"
    exit 0
  end
end
73 changes: 17 additions & 56 deletions spec/controllers/hyrax/downloads_controller_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@
'admin_set_tesim' => ['Open_Access_Articles_and_Book_Chapters']}
]
}
# Stand-in for the hash returned by WorkUtilsHelper.fetch_work_data_by_fileset_id.
# :title is a single String there (the first title_tesim value), not an Array.
let(:mock_work_data) do
  {
    work_id: '1z40m031g',
    work_type: 'Article',
    title: 'Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response',
    admin_set_id: 'h128zk07m',
    admin_set_name: 'Open_Access_Articles_and_Book_Chapters'
  }
end
let(:file_set) do
FactoryBot.create(:file_with_work, user: user, content: File.open("#{fixture_path}/files/image.png"))
end
Expand All @@ -53,8 +61,7 @@
allow(stub_matomo)
@user = user
sign_in @user
allow(controller).to receive(:fetch_record).and_return(mock_record)
allow(controller).to receive(:fetch_admin_set).and_return(mock_admin_set)
allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(mock_work_data)
allow(Hyrax::Analytics.config).to receive(:site_id).and_return(spec_site_id)
allow(SecureRandom).to receive(:uuid).and_return('555')
allow(Hyrax::VirusCheckerService).to receive(:file_has_virus?) { false }
Expand Down Expand Up @@ -201,8 +208,14 @@

context 'fileset without a parent work' do
before do
  # Work data in which every field carries the 'Unknown' placeholder
  unknown_work_data = %i[work_id work_type title admin_set_id admin_set_name].to_h do |field|
    [field, 'Unknown']
  end
  allow(WorkUtilsHelper).to receive(:fetch_work_data_by_fileset_id).and_return(unknown_work_data)
end

it 'records a download event with no work type' do
Expand Down Expand Up @@ -332,58 +345,6 @@
end
end

describe '#fetch_record' do
it 'fetches the record from Solr' do
expect(controller.send(:fetch_record)).to eq(mock_record)
end
end

describe '#fetch_admin_set' do
it 'fetches the admin set from Solr' do
expect(controller.send(:fetch_admin_set)).to eq(mock_admin_set)
end
end

describe '#admin_set_id' do
it 'returns the admin set id' do
expect(controller.send(:admin_set_id)).to eq('h128zk07m')
end
end

describe '#record_id' do
it 'returns the record id' do
expect(controller.send(:record_id)).to eq('1z40m031g')
end

it 'returns Unknown if the record is blank' do
allow(controller).to receive(:fetch_record).and_return([])
expect(controller.send(:record_id)).to eq('Unknown')
end
end

describe '#fileset_id' do
it 'returns the fileset id from params' do
controller.params = { id: file_set.id }
expect(controller.send(:fileset_id)).to eq(file_set.id)
end

it 'returns Unknown if params id is missing' do
controller.params = {}
expect(controller.send(:fileset_id)).to eq('Unknown')
end
end

describe '#record_title' do
it 'returns the record title' do
expect(controller.send(:record_title)).to eq('Key ethical issues discussed at CDC-sponsored international, regional meetings to explore cultural perspectives and contexts on pandemic influenza preparedness and response')
end

it 'returns Unknown if the record title is blank' do
allow(controller).to receive(:fetch_record).and_return([{ 'title_tesim' => nil }])
expect(controller.send(:record_title)).to eq('Unknown')
end
end

describe '#site_id' do
it 'returns the site id from ENV' do
expect(controller.send(:site_id)).to eq('5')
Expand Down
Loading
Loading