Advanced pagination (#433)
* Allow users to handle invalid nodes coming from a search against the API

* fix issues and shorten test

* first steps

* trying something

* add exception raising for uninitialized API usage

* add more documentation for user

* add some advanced features to the paginator

* fix oversight

* fixing a problem where an empty search raises an error

* add a test to hit empty searches
InnocentBug authored Mar 11, 2024
1 parent c596cd7 commit 08f6385
Showing 4 changed files with 121 additions and 17 deletions.
26 changes: 17 additions & 9 deletions src/cript/api/api.py
@@ -59,10 +59,10 @@ class API:
_host: str = ""
_api_token: str = ""
_storage_token: str = ""
_http_headers: dict = {}
_db_schema: Optional[DataSchema] = None
_api_prefix: str = "api"
_api_version: str = "v1"
_api_request_session: Union[None, requests.Session] = None

# trunk-ignore-begin(cspell)
# AWS S3 constants
@@ -213,9 +213,6 @@ def __init__(self, host: Union[str, None] = None, api_token: Union[str, None] =
self._api_token = api_token # type: ignore
self._storage_token = storage_token # type: ignore

# add Bearer to token for HTTP requests
self._http_headers = {"Authorization": f"Bearer {self._api_token}", "Content-Type": "application/json"}

# set a logger instance to use for the class logs
self._init_logger(default_log_level)

@@ -322,6 +319,14 @@ def connect(self):
CRIPTConnectionError
raised when the host does not give the expected response
"""

# Establish a requests session object
if self._api_request_session:
self.disconnect()
self._api_request_session = requests.Session()
# add Bearer to token for HTTP requests
self._api_request_session.headers = {"Authorization": f"Bearer {self._api_token}", "Content-Type": "application/json"}

# As a way to check our connection, we pull and establish the data schema
try:
self._db_schema = DataSchema(self)
@@ -344,6 +349,10 @@ def disconnect(self):
For manual connection: nested API objects are discouraged.
"""
# Disconnect request session
if self._api_request_session:
self._api_request_session.close()

# Restore the previously active global API (might be None)
global _global_cached_api
_global_cached_api = self._previous_global_cached_api
@@ -946,7 +955,7 @@ def delete_node_by_uuid(self, node_type: str, node_uuid: str) -> None:

self.logger.info(f"Deleted '{node_type.title()}' with UUID of '{node_uuid}' from CRIPT API.")

def _capsule_request(self, url_path: str, method: str, api_request: bool = True, headers: Optional[Dict] = None, timeout: int = _API_TIMEOUT, **kwargs) -> requests.Response:
def _capsule_request(self, url_path: str, method: str, api_request: bool = True, timeout: int = _API_TIMEOUT, **kwargs) -> requests.Response:
"""Helper function that capsules every request call we make against the backend.
Please *always* use this methods instead of `requests` directly.
@@ -971,9 +980,6 @@ def _capsule_request(self, url_path: str, method: str, api_request: bool = True,
additional keyword arguments that are passed to `request.request`
"""

if headers is None:
headers = self._http_headers

url: str = self.host
if api_request:
url += f"/{self.api_prefix}/{self.api_version}"
@@ -985,7 +991,9 @@ def _capsule_request(self, url_path: str, method: str, api_request: bool = True,
pre_log_message += "..."
self.logger.debug(pre_log_message)

response: requests.Response = requests.request(url=url, method=method, headers=headers, timeout=timeout, **kwargs)
if self._api_request_session is None:
raise CRIPTAPIRequiredError
response: requests.Response = self._api_request_session.request(url=url, method=method, timeout=timeout, **kwargs)
post_log_message: str = f"Request return with {response.status_code}"
if self.extra_api_log_debug_info:
post_log_message += f" {response.text}"
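For context, here is a minimal sketch of the session pattern this commit adopts: one `requests.Session` per API object carries the Bearer header and reuses connections across calls. The host URL and token below are placeholders, not values from the commit.

import requests

# One Session per API object: it pools TCP connections and attaches
# the Authorization header to every request made through it.
session = requests.Session()
session.headers = {"Authorization": "Bearer <api_token>", "Content-Type": "application/json"}

# Roughly what _capsule_request now does internally.
response = session.request(url="https://example.org/api/v1/material", method="GET", timeout=30)
print(response.status_code)

# Roughly what disconnect() now does.
session.close()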
11 changes: 10 additions & 1 deletion src/cript/api/exceptions.py
@@ -68,6 +68,8 @@ class CRIPTAPIRequiredError(CRIPTException):
"""
## Definition
Exception to be raised when the API object is requested, but no cript.API object exists yet.
Also make sure to use the API in a context manager (`with cript.API(...) as api:`) or to call
`connect` and `disconnect` manually.
The CRIPT Python SDK relies on a cript.API object for creation, validation, and modification of nodes.
The cript.API object may be explicitly called by the user to perform operations to the API, or
@@ -83,14 +85,21 @@ class CRIPTAPIRequiredError(CRIPTException):
my_token = "123456" # To use your token securely, please consider using environment variables
my_api = cript.API(host=my_host, token=my_token)
my_api.connect()
# Your code
my_api.disconnect()
```
"""

def __init__(self):
pass

def __str__(self) -> str:
error_message = "cript.API object is required for an operation, but it does not exist." "Please instantiate a cript.API object to continue." "See the documentation for more details."
error_message = (
"cript.API object is required for an operation, but it does not exist. "
"Please instantiate a cript.API object and connect it to the API, for example with a context manager `with cript.API() as api:`, to continue. "
"See the documentation for more details."
)

return error_message

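A hedged sketch of how this error now surfaces: constructing a cript.API object without connecting leaves `_api_request_session` as None, so the first request raises. The import path, the search arguments, and the assumption that the first page is fetched on `next()` are illustrative, not confirmed by this diff.

import cript
from cript.api.exceptions import CRIPTAPIRequiredError  # import path assumed

# Created but never connected: _api_request_session stays None.
api = cript.API(host="https://api.criptapp.org", api_token="123456", storage_token="987654")

try:
    paginator = api.search(node_type=cript.Material, search_mode=cript.SearchModes.NODE_TYPE, value_to_search="")
    next(paginator)  # the page fetch goes through _capsule_request, which raises
except CRIPTAPIRequiredError as error:
    print(error)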
75 changes: 72 additions & 3 deletions src/cript/api/paginator.py
@@ -33,6 +33,8 @@ class Paginator:
_current_position: int
_fetched_nodes: list
_number_fetched_pages: int = 0
_limit_page_fetches: Union[int, None] = None
_num_skip_pages: int = 0
auto_load_nodes: bool = True

@beartype
@@ -103,11 +105,15 @@ def _fetch_next_page(self) -> None:
None
"""

# Check if we are supposed to fetch more pages
if self._limit_page_fetches and self._number_fetched_pages >= self._limit_page_fetches:
raise StopIteration

# Composition of the query URL
temp_url_path: str = self._url_path
temp_url_path += f"/?q={self._query}"
if self._initial_page_number is not None:
temp_url_path += f"&page={self._initial_page_number + self._number_fetched_pages}"
temp_url_path += f"&page={self.page_number}"
self._number_fetched_pages += 1

response: requests.Response = self._api._capsule_request(url_path=temp_url_path, method="GET")
@@ -136,9 +142,9 @@ def _fetch_next_page(self) -> None:

if api_response["code"] == 404 and api_response["error"] == "The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.":
current_page_results = []

self._api.logger.debug(f"The paginator hit an HTTP 404 requesting {temp_url_path} with GET. We interpret it as no nodes present, but this is brittle at the moment.")
# if API response is not 200 raise error for the user to debug
if api_response["code"] != 200:
elif api_response["code"] != 200:
raise APIError(api_error=str(response.json()), http_method="GET", api_url=temp_url_path)

# Here we only load the JSON into the temporary results.
Expand Down Expand Up @@ -174,3 +180,66 @@ def __next__(self):
def __iter__(self):
self._current_position = 0
return self

@property
def page_number(self) -> Union[int, None]:
"""Obtain the current page number the paginator is fetching next.
Returns
-------
int
positive number of the next page this paginator is fetching.
None
if no page number is associated with the pagination
"""
page_number = self._num_skip_pages + self._number_fetched_pages
if self._initial_page_number is not None:
page_number += self._initial_page_number
return page_number

@beartype
def limit_page_fetches(self, max_num_pages: Union[int, None]) -> None:
"""Limit pagination to a maximum number of pages.
This can be used for very large searches with the paginator, so the search can be split into
smaller portions.
Parameters
----------
max_num_pages: Union[int, None]
positive integer with the maximum number of page fetches,
or None, indicating that an unlimited number of page fetches is permitted.
"""
self._limit_page_fetches = max_num_pages

def skip_pages(self, skip_pages: int) -> int:
"""Skip pages in the pagination.
Warning: this function is for advanced usage and may not produce the results you expect.
In particular, every search is different; even if we search for the same values, there is
no guarantee that the results come back in the same order. (And results can change if data is
added to or removed from CRIPT.) So if you break up your search with `limit_page_fetches` and
`skip_pages`, there is no guarantee that it matches one continuous search.
If the paginator's associated search does not accept pages, there is no effect.
Parameters
----------
skip_pages: int
Number of pages that the paginator skips now before fetching the next page.
The parameter is added to the internal state, so repeated calls skip more pages.
Returns
-------
int
The total number of pages this paginator now skips (internal skip count).
Raises
------
RuntimeError
If the total number of skipped pages is negative.
"""
num_skip_pages = self._num_skip_pages + skip_pages
if num_skip_pages < 0:
raise RuntimeError(f"Invalid number of skipped pages. The total number of pages skipped is negative {num_skip_pages}, requested to skip {skip_pages}.")
self._num_skip_pages = num_skip_pages
return self._num_skip_pages
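Taken together, the two new controls let a long search be processed in fixed-size windows, which the new `page_number` property makes verifiable. A usage sketch mirroring the updated test below; host, tokens, and search arguments are placeholders:

import cript

with cript.API(host="https://api.criptapp.org", api_token="123456", storage_token="987654") as api:
    paginator = api.search(node_type=cript.Material, search_mode=cript.SearchModes.NODE_TYPE, value_to_search="")

    # Skip the first three pages, then fetch at most three more (pages 3, 4, and 5).
    paginator.skip_pages(3)
    paginator.limit_page_fetches(3)

    for material in paginator:
        print(material.name)

    # 3 skipped + 3 fetched: the next page the paginator would request is 6.
    assert paginator.page_number == 6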
26 changes: 22 additions & 4 deletions tests/api/test_search.py
@@ -24,6 +24,8 @@ def test_api_search_node_type(cript_api: cript.API) -> None:

# test search results
assert isinstance(materials_paginator, Paginator)
materials_paginator.skip_pages(3)
materials_paginator.limit_page_fetches(3)
materials_list = []
while True:
try:
@@ -39,12 +41,9 @@ def test_api_search_node_type(cript_api: cript.API) -> None:
materials_paginator.auto_load_nodes = True
except StopIteration:
break
# We don't need to search for a million pages here.
if materials_paginator._number_fetched_pages > 6:
break

# Ensure that we paginated more than one page
assert materials_paginator._number_fetched_pages > 0
assert materials_paginator.page_number == 6
assert len(materials_list) > 5
first_page_first_result = materials_list[0].name
# just checking that the word has a few characters in it
Expand Down Expand Up @@ -104,6 +103,25 @@ def test_api_search_uuid(cript_api: cript.API, dynamic_material_data) -> None:
assert str(uuid_list[0].uuid) == dynamic_material_data["uuid"]


@pytest.mark.skipif(not HAS_INTEGRATION_TESTS_ENABLED, reason="requires a real cript_api_token")
def test_empty_paginator(cript_api: cript.API) -> None:
non_existent_name = "This is a nonsensical name for a material and should never exist. %^&*()_"
exact_name_paginator = cript_api.search(node_type=cript.Material, search_mode=cript.SearchModes.EXACT_NAME, value_to_search=non_existent_name)
with pytest.raises(StopIteration):
next(exact_name_paginator)
exact_name_paginator.auto_load_nodes = False
with pytest.raises(StopIteration):
next(exact_name_paginator)

# Special 0 UUID should not exist
uuid_paginator = cript_api.search(node_type=cript.Material, search_mode=cript.SearchModes.UUID, value_to_search="00000000-0000-0000-0000-000000000000")
with pytest.raises(StopIteration):
next(uuid_paginator)
uuid_paginator.auto_load_nodes = False
with pytest.raises(StopIteration):
next(uuid_paginator)


@pytest.mark.skipif(not HAS_INTEGRATION_TESTS_ENABLED, reason="requires a real cript_api_token")
def test_api_search_bigsmiles(cript_api: cript.API) -> None:
"""
