Skip to content

Commit

Permalink
Merge pull request #560 from vespa-engine/kkraune/feed
Browse files Browse the repository at this point in the history
Harmonize feed scripts MERGEOK
  • Loading branch information
kkraune authored Sep 6, 2023
2 parents 9059a2b + e0f6246 commit 58c580d
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 34 deletions.
39 changes: 32 additions & 7 deletions .github/workflows/feed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,23 @@ on:
branches: [ master ]

env:
DATA_PLANE_PUBLIC_KEY : ${{ secrets.VESPA_TEAM_DATA_PLANE_PUBLIC_CERT }}
DATA_PLANE_PRIVATE_KEY: ${{ secrets.VESPA_TEAM_DATA_PLANE_PRIVATE_KEY }}
DATA_PLANE_PUBLIC_KEY : ${{ secrets.VESPA_TEAM_DATA_PLANE_PUBLIC_CERT }}
DATA_PLANE_PRIVATE_KEY : ${{ secrets.VESPA_TEAM_DATA_PLANE_PRIVATE_KEY }}
VESPA_CLI_DATA_PLANE_CERT : ${{ secrets.VESPA_TEAM_VESPA_CLI_DATA_PLANE_CERT }}
VESPA_CLI_DATA_PLANE_KEY : ${{ secrets.VESPA_TEAM_VESPA_CLI_DATA_PLANE_KEY }}

jobs:
build:
runs-on: ubuntu-latest
steps:

- uses: actions/checkout@v3
- uses: actions/checkout@v4

- uses: ruby/setup-ruby@v1
with:
ruby-version: 3.1
bundler-cache: true

- uses: actions/setup-python@v4
with:
python-version: '3.x'

- name: Install Sphinx
run: |
sudo apt-get install -y pandoc
Expand All @@ -36,20 +34,47 @@ jobs:
# Strip jekyll liquid macros
sed -i.orig 's/{%/{ %/g; s/%}/% }/g; s/{{/{ {/g; s/}}/} }/g' docs/sphinx/build/*.html ; rm docs/sphinx/build/*.orig
bundle exec jekyll build -s docs/sphinx/build/ -p _plugins-vespafeed --config _config.yml
# Remove files that should not be in the search index
rm _site/search.html _site/genindex.html
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.x'

- name: Install dependencies
run: |
pip3 install PyYAML spacy mmh3 requests html5lib beautifulsoup4 markdownify tiktoken
- name: Get Vespa CLI - update to later versions as needed
run: |
apt update && apt -y install curl
curl -SsLo vespa-cli.tar.gz https://github.com/vespa-engine/vespa/releases/download/v8.209.11/vespa-cli_8.209.11_linux_amd64.tar.gz
tar -xvf vespa-cli.tar.gz && ln -fs vespa-cli_8.209.11_linux_amd64/bin/vespa
- name: Feed site
run: |
# The Python scripts below use the Vespa CLI for feeding / data access.
# See https://docs.vespa.ai/en/vespa-cli.html.
# The environment variables below have credentials for endpoint access -
# use the key/cert files in .vespa and paste their content into GitHub Secrets.
# VESPA_CLI_API_KEY must be set and empty as below.
export VESPA_CLI_DATA_PLANE_CERT
export VESPA_CLI_DATA_PLANE_KEY
export VESPA_CLI_API_KEY=
./feed_to_vespa.py _config.yml
- name: Feed paragraphs site
run: |
export VESPA_CLI_DATA_PLANE_CERT
export VESPA_CLI_DATA_PLANE_KEY
export VESPA_CLI_API_KEY=
./feed-split.py pyvespa_index.json https://pyvespa.readthedocs.io questions.jsonl
./feed_to_vespa.py _paragraphs_config.yml
- name: Feed suggestions
run: |
export VESPA_CLI_DATA_PLANE_CERT
export VESPA_CLI_DATA_PLANE_KEY
export VESPA_CLI_API_KEY=
./feed_to_vespa.py _suggestions_config.yml
55 changes: 28 additions & 27 deletions feed_to_vespa.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

import json
import os
import re
import subprocess
import sys
import yaml
import requests
from requests.adapters import HTTPAdapter, Retry

import urllib.parse

def find(json, path, separator = "."):
if len(path) == 0: return json
Expand All @@ -16,8 +18,8 @@ def find(json, path, separator = "."):


# extract <id> from form id:open:doc::<id>
def get_document_id(docid):
return docid[docid.rfind(":") + 1:]
def get_document_id(id):
return id[id.rfind(":")+1:]


def get_private_key_path():
Expand Down Expand Up @@ -50,13 +52,10 @@ def vespa_delete(endpoint, operation, options):
return session.delete(url).json()


def vespa_post(endpoint, doc, docid, namespace, doc_type):
url = "{0}/document/v1/{1}/{2}/docid/{3}".format(endpoint, namespace, doc_type, docid)
return session.post(url, json=doc).json()


def vespa_visit(endpoint, namespace, doc_type, continuation = None):
options = ["wantedDocumentCount=500", "timeout=60s"]
options = []
options.append("wantedDocumentCount=500")
options.append("timeout=60s")
if continuation is not None and len(continuation) > 0:
options.append("&continuation={0}".format(continuation))
return vespa_get(endpoint, "document/v1/{0}/{1}/docid".format(namespace,doc_type), options)
Expand All @@ -65,19 +64,15 @@ def vespa_visit(endpoint, namespace, doc_type, continuation = None):
def vespa_remove(endpoint, doc_ids, namespace, doc_type):
options = []
for doc_id in doc_ids:
docid = get_document_id(doc_id)
vespa_delete(endpoint, "document/v1/{0}/{1}/docid/{2}".format(namespace, doc_type, docid), options)
id = get_document_id(doc_id)
vespa_delete(endpoint, "document/v1/{0}/{1}/docid/{2}".format(namespace, doc_type, id), options)


def vespa_feed(endpoint, feed, namespace, doc_type):
for doc in get_docs(feed):
if doc_type == "doc":
document_id = find(doc, "fields.namespace") + find(doc, "fields.path")
elif doc_type == "term":
document_id = str(find(doc, "fields.hash"))
elif doc_type == "paragraph":
document_id = get_document_id(doc['put'])
print(vespa_post(endpoint, doc, document_id, namespace, doc_type))
if doc_type == "paragraph" or doc_type == "term" or doc_type == "doc":
splits = re.split(r'/|\.', endpoint)
app_string = splits[3] + '.' + splits[2]
print(subprocess.run(['./vespa', 'feed', '-a', app_string, '-t', endpoint, feed], capture_output=True))


def get_docs(index):
Expand All @@ -89,12 +84,18 @@ def get_indexed_docids(endpoint, namespace, doc_type):
docids = set()
continuation = ""
while continuation is not None:
visit_json = vespa_visit(endpoint, namespace, doc_type, continuation)
documents = find(visit_json, "documents")
json = vespa_visit(endpoint, namespace, doc_type, continuation)
documents = find(json, "documents")
if documents is not None:
ids = [find(document, "id") for document in documents]
docids.update(ids)
continuation = find(visit_json, "continuation")
ids = [ find(document, "id") for document in documents ]
for id in ids:
# The document id might contain chars that need to be escaped for the delete/put operations to work,
# and also for comparison with what is in the feed
docid = get_document_id(id) # return the last part
encoded = urllib.parse.quote(docid) #escape
id = id.replace(docid, encoded)
docids.add(id)
continuation = find(json, "continuation")
return docids


Expand Down Expand Up @@ -154,8 +155,8 @@ def update_endpoint(endpoint, config):
docids_to_remove = docids_in_index.difference(docids_in_feed)
if len(docids_to_remove) > 0:
print_header("Removing indexed documents not in feed in {0}".format(endpoint_url))
for docid in docids_to_remove:
print("To Remove: {0}".format(docid))
for id in docids_to_remove:
print("To Remove: {0}".format(id))
vespa_remove(endpoint_url, docids_to_remove, namespace, doc_type)
print("{0} documents removed.".format(len(docids_to_remove)))
else:
Expand All @@ -175,7 +176,7 @@ def main():
session = requests.Session()
retries = Retry(total=10, connect=10,
backoff_factor=0.8,
status_forcelist=[500, 503, 504, 429]
status_forcelist=[ 500, 503, 504, 429 ]
)
session.mount('https://', HTTPAdapter(max_retries=retries))
session.cert = (get_public_cert_path(), get_private_key_path())
Expand Down

0 comments on commit 58c580d

Please sign in to comment.