Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spike generate popular tasks using BigQuery #3761

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,6 @@
# vim swap files and tags
*.sw[a-z]
/tags

# Ignore local config
config/local_env.yml
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ gem "dalli"
gem "dartsass-rails"
gem "faraday"
gem "gds-api-adapters"
gem "google-cloud-bigquery"
gem "govspeak"
gem "govuk_ab_testing"
gem "govuk_app_config"
Expand Down
49 changes: 49 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ GEM
sass-embedded (~> 1.63)
date (3.3.4)
debug_inspector (1.2.0)
declarative (0.0.20)
diff-lcs (1.5.1)
dig_rb (1.0.1)
docile (1.4.0)
Expand Down Expand Up @@ -186,11 +187,41 @@ GEM
rest-client (~> 2.0)
globalid (1.2.1)
activesupport (>= 6.1)
google-apis-bigquery_v2 (0.70.0)
google-apis-core (>= 0.15.0, < 2.a)
google-apis-core (0.15.0)
addressable (~> 2.5, >= 2.5.1)
googleauth (~> 1.9)
httpclient (>= 2.8.1, < 3.a)
mini_mime (~> 1.0)
representable (~> 3.0)
retriable (>= 2.0, < 4.a)
rexml
google-cloud-bigquery (1.49.0)
concurrent-ruby (~> 1.0)
google-apis-bigquery_v2 (~> 0.62)
google-apis-core (~> 0.13)
google-cloud-core (~> 1.6)
googleauth (~> 1.9)
mini_mime (~> 1.0)
google-cloud-core (1.7.0)
google-cloud-env (>= 1.0, < 3.a)
google-cloud-errors (~> 1.0)
google-cloud-env (2.1.1)
faraday (>= 1.0, < 3.a)
google-cloud-errors (1.4.0)
google-protobuf (4.27.3)
bigdecimal
rake (>= 13)
googleapis-common-protos-types (1.15.0)
google-protobuf (>= 3.18, < 5.a)
googleauth (1.11.0)
faraday (>= 1.0, < 3.a)
google-cloud-env (~> 2.1)
jwt (>= 1.4, < 3.0)
multi_json (~> 1.11)
os (>= 0.9, < 2.0)
signet (>= 0.16, < 2.a)
govspeak (8.3.4)
actionview (>= 6)
addressable (>= 2.3.8, < 3)
Expand Down Expand Up @@ -245,6 +276,7 @@ GEM
csv
mini_mime (>= 1.0.0)
multi_xml (>= 0.5.2)
httpclient (2.8.3)
i18n (1.14.5)
concurrent-ruby (~> 1.0)
i18n-coverage (0.2.0)
Expand All @@ -265,6 +297,8 @@ GEM
json (2.7.2)
json-schema (4.3.0)
addressable (>= 2.8)
jwt (2.8.1)
base64
kramdown (2.4.0)
rexml
language_server-protocol (3.17.0.3)
Expand Down Expand Up @@ -293,6 +327,7 @@ GEM
mocha (2.4.5)
ruby2_keywords (>= 0.0.5)
msgpack (1.7.2)
multi_json (1.15.0)
multi_test (1.1.0)
multi_xml (0.7.1)
bigdecimal (~> 3.1)
Expand Down Expand Up @@ -514,6 +549,7 @@ GEM
opentelemetry-semantic_conventions
opentelemetry-semantic_conventions (1.10.1)
opentelemetry-api (~> 1.0)
os (1.1.4)
pact (1.65.1)
pact-mock_service (~> 3.0, >= 3.3.1)
pact-support (~> 1.16, >= 1.16.9)
Expand Down Expand Up @@ -622,13 +658,18 @@ GEM
regexp_parser (2.9.2)
reline (0.5.9)
io-console (~> 0.5)
representable (3.2.0)
declarative (< 0.1.0)
trailblazer-option (>= 0.1.1, < 0.2.0)
uber (< 0.2.0)
request_store (1.7.0)
rack (>= 1.4)
rest-client (2.1.0)
http-accept (>= 1.7.0, < 2.0)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
retriable (3.1.2)
rexml (3.3.5)
strscan
rinku (2.0.6)
Expand Down Expand Up @@ -705,6 +746,11 @@ GEM
sentry-ruby (5.18.2)
bigdecimal
concurrent-ruby (~> 1.0, >= 1.0.2)
signet (0.19.0)
addressable (~> 2.8)
faraday (>= 0.17.5, < 3.a)
jwt (>= 1.5, < 3.0)
multi_json (~> 1.10)
simplecov (0.22.0)
docile (~> 1.1)
simplecov-html (~> 0.11)
Expand Down Expand Up @@ -747,8 +793,10 @@ GEM
tins (1.33.0)
bigdecimal
sync
trailblazer-option (0.1.2)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
uber (0.1.0)
unicode-display_width (2.5.0)
unparser (0.6.15)
diff-lcs (~> 1.3)
Expand Down Expand Up @@ -781,6 +829,7 @@ DEPENDENCIES
erb_lint
faraday
gds-api-adapters
google-cloud-bigquery
govspeak
govuk_ab_testing
govuk_app_config
Expand Down
46 changes: 26 additions & 20 deletions app/helpers/browse_helper.rb
Original file line number Diff line number Diff line change
@@ -1,27 +1,33 @@
module BrowseHelper
def display_popular_links_for_slug?(slug)
I18n.exists?(slug.to_s, scope: "browse.popular_links")
# Drops everything that precedes the last "/browse/" segment of +path+.
# A path that contains no "/browse/" segment is returned unchanged.
#
# @param path [String] path or URL to trim (defaults to +base_path+)
# @return [String] the remainder, starting at "/browse/" when present
def slug(path = base_path)
  leading_part = %r{.*(?=/browse/)}
  path.sub(leading_part, "")
end

# Whether the popular-tasks section is shown for the given browse slug.
# Only the "benefits" and "business" slugs are enabled.
#
# @param slug [String] browse page slug
# @return [Boolean]
def display_popular_tasks_for_slug?(slug)
  enabled_slugs = %w[benefits business]
  enabled_slugs.any? { |enabled| enabled == slug }
end

# Returns the popular links/tasks to display for a browse page.
#
# NOTE(review): this span appears to be a rendered diff in which the removed
# and added bodies are interleaved. The I18n-based lines directly below look
# like the removed implementation (its `do` block is never closed); the
# cache-based lines after them look like the replacement — confirm against
# the real file before editing.
def popular_links_for_slug(slug)
links = I18n.t(slug.to_s, scope: "browse.popular_links")
count = links.length
links.map.with_index(1) do |link, index|
{
text: link[:title],
href: link[:url],
data_attributes: {
module: "ga4-link-tracker",
ga4_track_links_only: "",
ga4_link: {
event_name: "navigation",
type: "action",
index_link: index,
index_total: count,
text: link[:title],
},
},
}
# `slug(slug)` calls the #slug helper with the local `slug` argument.
browse_page = slug(slug)

# Cache keys for the specific browse page
# NOTE(review): PopularTasks#fetch_data is called below with
# "/browse/#{browse_page}" and builds its keys from that full path, whereas
# the keys here are built from `browse_page` alone. The two sets of keys
# therefore appear not to match, so these reads would not see entries
# written by fetch_data — confirm and align the key format.
cache_key_latest = "popular_tasks_#{browse_page}_#{Date.yesterday.strftime("%Y-%m-%d")}"
cache_key_backup = "popular_tasks_backup_#{browse_page}"

# Try to fetch the latest cache first
popular_task_data = Rails.cache.read(cache_key_latest)

# If the latest cache doesn't exist, fall back to the backup cache
if popular_task_data.nil?
# Falling back to backup cache
popular_task_data = Rails.cache.read(cache_key_backup)
end

# If both caches are empty, fetch fresh data and cache it
# (fetch_data itself writes both the dated and the backup cache entries).
if popular_task_data.nil?
popular_task_data = PopularTasks.new.fetch_data("/browse/#{browse_page}")
end

popular_task_data
end
end
25 changes: 25 additions & 0 deletions app/services/bigquery.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require "google/cloud/bigquery"
require "googleauth"

# Factory for a Google BigQuery client authenticated as a service account.
#
# Configuration is read from the environment:
#   BIGQUERY_PROJECT       - project id passed to the client
#   BIGQUERY_CLIENT_EMAIL  - service-account e-mail
#   BIGQUERY_PRIVATE_KEY   - service-account private key
class Bigquery
  include Google::Auth

  # Shorthand for +Bigquery.new.build+.
  def self.build
    new.build
  end

  # Builds the client from the environment configuration.
  #
  # Uses ENV.fetch so that a missing variable fails immediately with a
  # KeyError naming the variable, instead of handing nil credentials to the
  # Google libraries.
  #
  # @return the client object returned by Google::Cloud::Bigquery.new
  # @raise [KeyError] if a required environment variable is not set
  def build
    Google::Cloud::Bigquery.new(
      project_id: ENV.fetch("BIGQUERY_PROJECT"),
      credentials: Google::Auth::ServiceAccountCredentials.make_creds(
        json_key_io: StringIO.new(service_account_key.to_json),
        scope: ["https://www.googleapis.com/auth/bigquery"],
      ),
    )
  end

  private

  # Minimal service-account key document, serialised to JSON for make_creds.
  def service_account_key
    {
      "client_email" => ENV.fetch("BIGQUERY_CLIENT_EMAIL"),
      "private_key" => ENV.fetch("BIGQUERY_PRIVATE_KEY"),
    }
  end
end
71 changes: 71 additions & 0 deletions app/services/popular_tasks.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
class PopularTasks
# Lifetime of the dated ("latest") cache entry written by #fetch_data.
CACHE_EXPIRATION = 24.hours
# Lifetime of the undated backup cache entry written alongside it; longer than
# CACHE_EXPIRATION so the backup outlives the dated entry.
BACKUP_CACHE_EXPIRATION = 7.days
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to think a bit more about how this would work.

If the bigquery data was unavailable for more than 7 days then what happens?

I can think of other ways to do it, but this feels like a problem that must have been solved many times before, i.e. only expire the cache if fresh data is available to fill it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've caught up now... the cache will expire regardless of whether or not the API responds so I understand the need for a backup. And I like the idea of writing to the backup at the same time as you fetch the fresh data.


# Nothing to set up at construction time.
def initialize
end

# Memoised BigQuery client: built via Bigquery.build on first use and reused
# on every later call.
def client
  return @client if @client

  @client = Bigquery.build
end

# Returns the five highest-ranked search destinations clicked from a browse
# page, querying BigQuery only when no cached result exists.
#
# The result is cached under a dated key for CACHE_EXPIRATION. Whenever the
# query actually runs, the same rows are also written to an undated backup
# key (BACKUP_CACHE_EXPIRATION).
#
# The browse page is passed to BigQuery as a named query parameter rather
# than interpolated into the SQL text, so its content cannot alter the query.
#
# NOTE(review): +date+ only affects the dated cache key. The SQL always reads
# the table for CURRENT_DATE() minus two days — confirm whether the two are
# meant to line up.
#
# @param browse_page [String] referrer path to filter on, e.g. "/browse/benefits"
# @param date [Date] date stamped into the dated cache key
# @return [Array<Hash>] hashes with :url, :browse_page and :rank, ordered by rank
def fetch_data(browse_page, date: Date.yesterday)
  date_stamp = date.strftime("%Y-%m-%d")
  cache_key_latest = "popular_tasks_#{browse_page}_#{date_stamp}"
  cache_key_backup = "popular_tasks_backup_#{browse_page}"

  Rails.cache.fetch(cache_key_latest, expires_in: CACHE_EXPIRATION) do
    # Only runs on a cache miss.
    sql = <<~SQL
      WITH cte1 as (SELECT
      event_date,
      event_name,
      search_term,
      cleaned_page_location,
      cleaned_page_referrer,
      link_url,
      count(event_name) as click,

      FROM `ga4-analytics-352613.flattened_dataset.flattened_daily_ga_data_*`
      WHERE _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY))
      -- WHERE _table_suffix IN ('20240708', '20240709','20240710','20240711','20240712','20240713','20240714')
      group by 1,2,3,4,5,6),

      CTE2 as (SELECT
      event_date,
      sum(click) as clicks,
      cleaned_page_referrer as BrowsePage,
      search_term,
      ROW_NUMBER() OVER(PARTITION BY cleaned_page_referrer ORDER BY click DESC) Rank,
      link_url as SearchDestPage
      FROM cte1
      WHERE event_name = 'select_item'
      AND cleaned_page_referrer = @browse_page
      AND cleaned_page_location = '/search/all'
      group by click,event_date,cleaned_page_referrer,search_term,link_url
      order by cleaned_page_referrer,Rank asc)

      SELECT
      *
      FROM CTE2
      WHERE Rank <6
    SQL

    rows = client.query(sql, params: { browse_page: browse_page }).all

    links = rows.map do |row|
      {
        url: row[:SearchDestPage], # Using SearchDestPage as the link URL
        browse_page: row[:BrowsePage], # Using BrowsePage as the L1 browse
        rank: row[:Rank], # Rank to order the links
      }
    end

    # The outer SELECT has no ORDER BY, so order explicitly by rank here and
    # keep the sorted array (sort_by returns a new array).
    results = links.sort_by { |link| link[:rank] }

    # Cache the results in the backup cache as well
    Rails.cache.write(cache_key_backup, results, expires_in: BACKUP_CACHE_EXPIRATION)

    results
  end
end
end
26 changes: 21 additions & 5 deletions app/views/browse/show.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
} %>
<% end %>

<% if display_popular_links_for_slug?(page.slug) %>
<% if display_popular_tasks_for_slug?(page.slug) %>
<div class="govuk-width-container">
<div class="govuk-grid-row">
<div class="govuk-grid-column-full">
Expand All @@ -45,9 +45,25 @@
font_size: "m"
} %>
<ul class="govuk-list govuk-!-margin-bottom-7">
<% popular_links_for_slug(page.slug).each do |link| %>
<% popular_links_for_slug(page.slug).each_with_index do |task, index| %>
<li>
<%= render partial: "shared/browse_action_link", locals: {link:} %>
<%= render "govuk_publishing_components/components/action_link", {
text: task[:url],
href: task[:url],
dark_large_icon: true,
margin_bottom: 3,
data_attributes: {
module: "ga4-link-tracker",
ga4_track_links_only: "",
ga4_link: {
event_name: "navigation",
type: "action",
index_link: index + 1 ,
index_total: popular_links_for_slug(page.slug).length,
text: task[:url]
}
}
} %>
</li>
<% end %>
</ul>
Expand All @@ -60,7 +76,7 @@
<% total_links = page.second_level_browse_pages.count.to_s %>
<%= render "shared/browse_cards_container" do %>
<%= render "govuk_publishing_components/components/cards", {
heading: display_popular_links_for_slug?(page.slug) ? t("browse.topics") : nil,
heading: display_popular_tasks_for_slug?(page.slug) ? t("browse.topics") : nil,
items: page.second_level_browse_pages.map.with_index do |second_level_browse_page, index|
{
link: {
Expand All @@ -78,6 +94,6 @@
description: second_level_browse_page.description,
}
end,
sub_heading_level: display_popular_links_for_slug?(page.slug) ? 3 : 2,
sub_heading_level: display_popular_tasks_for_slug?(page.slug) ? 3 : 2,
} %>
<% end %>
9 changes: 9 additions & 0 deletions config/environments/development.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,13 @@

# Uncomment if you wish to allow Action Cable access from any origin.
# config.action_cable.disable_request_forgery_protection = true

# Development convenience: if config/local_env.yml exists, copy each of its
# top-level key/value pairs into ENV.
config.before_configuration do
  env_file = Rails.root.join("config/local_env.yml")
  if File.exist?(env_file)
    # File.read closes the file handle once the content is read. An empty
    # file parses to nil/false, so fall back to an empty hash.
    local_env = YAML.safe_load(File.read(env_file)) || {}
    local_env.each do |key, value|
      # ENV accepts only String values (nil unsets the variable), so YAML
      # scalars such as integers and booleans are stringified first.
      ENV[key.to_s] = value&.to_s
    end
  end
end
end