From 434462e4cb8c279780d48e0f2bb6721cfa6f424c Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Sat, 6 Jan 2024 16:35:24 +0200 Subject: [PATCH] Restore citeseerx special casing Partially reverts ddab25a5ee71e2f23fe4b8dfb5a28c8da333a922 Bug: T354471 --- src/oabot/main.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/oabot/main.py b/src/oabot/main.py index 4738d6b..2226a7a 100644 --- a/src/oabot/main.py +++ b/src/oabot/main.py @@ -391,10 +391,17 @@ def get_oa_link(paper, doi=None, only_unpaywall=True): return False, oa_status for oa_location in resp.get('oa_locations') or []: + landing_page = oa_location.get('url_for_landing_page', '') # In case there's a handle, prefer the landing page URL over the PDF link # as the hdl URL will be converted to the hdl parameter. - if 'hdl.handle.net' in oa_location.get('url_for_landing_page', ''): - candidate_urls.append(oa_location.get('url_for_landing_page')) + if 'hdl.handle.net' in landing_page: + candidate_urls.append(landing_page) + # T354471: If the URL comes from CiteSeerX, use the landing page URL + # so that other arxiv/identifier matches have a chance to rank higher + # and override any incorrect matches by title on the CiteSeerX side. + if 'citeseerx.ist.psu.edu' in landing_page: + candidate_urls.append(landing_page.replace("/summary", "/download")) + if oa_location.get('url') and oa_location.get('host_type') != 'publisher': candidate_urls.append(oa_location['url'])