From 2fcd611005836ae6d3b060664cd576d466329498 Mon Sep 17 00:00:00 2001 From: Kristian Aune Date: Wed, 6 Sep 2023 13:22:32 +0200 Subject: [PATCH 1/2] Harmonize feed scripts --- .github/workflows/feed.yml | 37 ++++++++++++++++++++----- feed_to_vespa.py | 55 +++++++++++++++++++------------------- 2 files changed, 58 insertions(+), 34 deletions(-) diff --git a/.github/workflows/feed.yml b/.github/workflows/feed.yml index 6a7fdc90..eead7420 100644 --- a/.github/workflows/feed.yml +++ b/.github/workflows/feed.yml @@ -4,25 +4,23 @@ on: branches: [ master ] env: - DATA_PLANE_PUBLIC_KEY : ${{ secrets.VESPA_TEAM_DATA_PLANE_PUBLIC_CERT }} - DATA_PLANE_PRIVATE_KEY: ${{ secrets.VESPA_TEAM_DATA_PLANE_PRIVATE_KEY }} + DATA_PLANE_PUBLIC_KEY : ${{ secrets.VESPA_TEAM_DATA_PLANE_PUBLIC_CERT }} + DATA_PLANE_PRIVATE_KEY : ${{ secrets.VESPA_TEAM_DATA_PLANE_PRIVATE_KEY }} + VESPA_CLI_DATA_PLANE_CERT : ${{ secrets.VESPA_TEAM_VESPA_CLI_DATA_PLANE_CERT }} + VESPA_CLI_DATA_PLANE_KEY : ${{ secrets.VESPA_TEAM_VESPA_CLI_DATA_PLANE_KEY }} jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: 3.1 bundler-cache: true - - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - name: Install Sphinx run: | sudo apt-get install -y pandoc @@ -37,19 +35,44 @@ jobs: sed -i.orig 's/{%/{ %/g; s/%}/% }/g; s/{{/{ {/g; s/}}/} }/g' docs/sphinx/build/*.html ; rm docs/sphinx/build/*.orig bundle exec jekyll build -s docs/sphinx/build/ -p _plugins-vespafeed --config _config.yml + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + - name: Install dependencies run: | pip3 install PyYAML spacy mmh3 requests html5lib beautifulsoup4 markdownify tiktoken + - name: Get Vespa CLI - update to later versions as needed + run: | + apt update && apt -y install curl + curl -SsLo vespa-cli.tar.gz https://github.com/vespa-engine/vespa/releases/download/v8.209.11/vespa-cli_8.209.11_linux_amd64.tar.gz + tar -xvf vespa-cli.tar.gz && ln -fs vespa-cli_8.209.11_linux_amd64/bin/vespa + - name: Feed site run: | + # The python scripts below uses the Vespa CLI for feeding / data access. + # See https://docs.vespa.ai/en/vespa-cli.html. + # The environment variables below have credentials for endpoint access - + # use the key/cert files in .vespa and paste their content into GitHub Secrets. + # VESPA_CLI_API_KEY must be set and empty as below. + export VESPA_CLI_DATA_PLANE_CERT + export VESPA_CLI_DATA_PLANE_KEY + export VESPA_CLI_API_KEY= ./feed_to_vespa.py _config.yml - name: Feed paragraphs site run: | + export VESPA_CLI_DATA_PLANE_CERT + export VESPA_CLI_DATA_PLANE_KEY + export VESPA_CLI_API_KEY= ./feed-split.py pyvespa_index.json https://pyvespa.readthedocs.io questions.jsonl ./feed_to_vespa.py _paragraphs_config.yml - name: Feed suggestions run: | + export VESPA_CLI_DATA_PLANE_CERT + export VESPA_CLI_DATA_PLANE_KEY + export VESPA_CLI_API_KEY= ./feed_to_vespa.py _suggestions_config.yml diff --git a/feed_to_vespa.py b/feed_to_vespa.py index d9a28aed..c5029c28 100755 --- a/feed_to_vespa.py +++ b/feed_to_vespa.py @@ -3,11 +3,13 @@ import json import os +import re +import subprocess import sys import yaml import requests from requests.adapters import HTTPAdapter, Retry - +import urllib.parse def find(json, path, separator = "."): if len(path) == 0: return json @@ -16,8 +18,8 @@ def find(json, path, separator = "."): # extract from form id:open:doc:: -def get_document_id(docid): - return docid[docid.rfind(":") + 1:] +def get_document_id(id): + return id[id.rfind(":")+1:] def get_private_key_path(): @@ -50,13 +52,10 @@ def vespa_delete(endpoint, operation, options): return session.delete(url).json() -def vespa_post(endpoint, doc, docid, namespace, doc_type): - url = "{0}/document/v1/{1}/{2}/docid/{3}".format(endpoint, namespace, doc_type, docid) - return session.post(url, json=doc).json() - - def vespa_visit(endpoint, namespace, doc_type, continuation = None): - options = ["wantedDocumentCount=500", "timeout=60s"] + options = [] + options.append("wantedDocumentCount=500") + options.append("timeout=60s") if continuation is not None and len(continuation) > 0: options.append("&continuation={0}".format(continuation)) return vespa_get(endpoint, "document/v1/{0}/{1}/docid".format(namespace,doc_type), options) @@ -65,19 +64,15 @@ def vespa_visit(endpoint, namespace, doc_type, continuation = None): def vespa_remove(endpoint, doc_ids, namespace, doc_type): options = [] for doc_id in doc_ids: - docid = get_document_id(doc_id) - vespa_delete(endpoint, "document/v1/{0}/{1}/docid/{2}".format(namespace, doc_type, docid), options) + id = get_document_id(doc_id) + vespa_delete(endpoint, "document/v1/{0}/{1}/docid/{2}".format(namespace, doc_type, id), options) def vespa_feed(endpoint, feed, namespace, doc_type): - for doc in get_docs(feed): - if doc_type == "doc": - document_id = find(doc, "fields.namespace") + find(doc, "fields.path") - elif doc_type == "term": - document_id = str(find(doc, "fields.hash")) - elif doc_type == "paragraph": - document_id = get_document_id(doc['put']) - print(vespa_post(endpoint, doc, document_id, namespace, doc_type)) + if doc_type == "paragraph" or doc_type == "term" or doc_type == "doc": + splits = re.split(r'/|\.', endpoint) + app_string = splits[3] + '.' + splits[2] + print(subprocess.run(['./vespa', 'feed', '-a', app_string, '-t', endpoint, feed], capture_output=True)) def get_docs(index): @@ -89,12 +84,18 @@ def get_indexed_docids(endpoint, namespace, doc_type): docids = set() continuation = "" while continuation is not None: - visit_json = vespa_visit(endpoint, namespace, doc_type, continuation) - documents = find(visit_json, "documents") + json = vespa_visit(endpoint, namespace, doc_type, continuation) + documents = find(json, "documents") if documents is not None: - ids = [find(document, "id") for document in documents] - docids.update(ids) - continuation = find(visit_json, "continuation") + ids = [ find(document, "id") for document in documents ] + for id in ids: + # The document id might contain chars that needs to be escaped for the delete/put operation to work + # also for comparison with what is in the feed + docid = get_document_id(id) # return the last part + encoded = urllib.parse.quote(docid) #escape + id = id.replace(docid, encoded) + docids.add(id) + continuation = find(json, "continuation") return docids @@ -154,8 +155,8 @@ def update_endpoint(endpoint, config): docids_to_remove = docids_in_index.difference(docids_in_feed) if len(docids_to_remove) > 0: print_header("Removing indexed documents not in feed in {0}".format(endpoint_url)) - for docid in docids_to_remove: - print("To Remove: {0}".format(docid)) + for id in docids_to_remove: + print("To Remove: {0}".format(id)) vespa_remove(endpoint_url, docids_to_remove, namespace, doc_type) print("{0} documents removed.".format(len(docids_to_remove))) else: @@ -175,7 +176,7 @@ def main(): session = requests.Session() retries = Retry(total=10, connect=10, backoff_factor=0.8, - status_forcelist=[500, 503, 504, 429] + status_forcelist=[ 500, 503, 504, 429 ] ) session.mount('https://', HTTPAdapter(max_retries=retries)) session.cert = (get_public_cert_path(), get_private_key_path()) From e0f6246e90de25e2d4792355c47a22b3898383fd Mon Sep 17 00:00:00 2001 From: Kristian Aune Date: Wed, 6 Sep 2023 13:35:53 +0200 Subject: [PATCH 2/2] remove noise --- .github/workflows/feed.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/feed.yml b/.github/workflows/feed.yml index eead7420..2462c5c7 100644 --- a/.github/workflows/feed.yml +++ b/.github/workflows/feed.yml @@ -34,6 +34,8 @@ jobs: # Strip jekyll liquid macros sed -i.orig 's/{%/{ %/g; s/%}/% }/g; s/{{/{ {/g; s/}}/} }/g' docs/sphinx/build/*.html ; rm docs/sphinx/build/*.orig bundle exec jekyll build -s docs/sphinx/build/ -p _plugins-vespafeed --config _config.yml + # Remove files that should not be in the search index + rm _site/search.html _site/genindex.html - name: Setup Python uses: actions/setup-python@v4