Enhance performance when exporting data on endpoint `api/v1/data/<form_id>.<format>` (#2460)

* remove default ordering by id when exporting data from Data ViewSet

The default ordering by id makes queries run extremely slowly when exporting large amounts of data.
To sort data by id in ascending order, pass the query parameter sort={"_id":1} (see the sketch after this list). For more info read https://github.com/onaio/onadata/blob/main/docs/data.rst#sort-submitted-data-of-a-specific-form-using-existing-fields

* remove duplicate db queries on endpoint /api/v1/data/<form_id>.csv

* fix failing test

* fix flaky test

* fix flaky test

* fix flaky test

* remove futile test case

Notes are excluded when exporting the CSV. The _notes column is usually added to the CSV, but it is always overridden to be blank as per the implementation in the CSVDataFrameBuilder class, so this test case is futile.

* use query param to sort exported data in XFormViewSet

* disable pylint rule too-many-lines for file

* fix flaky tests

Fix flaky tests by ensuring the queryset is always explicitly ordered (see the ordering sketch after this list).

* order records in test cases to avoid flaky results

* optimize performance by removing redundant implementations

* stop tracking onadata/test_data_media

* disable pylint rules

Fixing the rules would require a lot of refactoring, so disabling them will suffice for now (a typical file-level directive is sketched after this list).

* disable linting warning

* address lint warning

* address lint warnings

* refactor code

* address failing tests

* address lint warnings

* address linting error

* address lint warning

* fix failing test

* refactor code

* fix failing tests

* refactor

* suppress lint warning

* get sort param from request

* remove unused arg

* update documentation

* update documentation

* sort paginated data by id

* address linting error line too long

* fix failing test

* update test case
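
A minimal sketch of requesting sorted export data, as referenced in the first bullet above. The host, form id, token, and output filename are hypothetical placeholders; only the `/api/v1/data/<form_id>.<format>` endpoint, the `sort` parameter, and the `Token` auth header come from this commit and the linked docs.

```python
import json

import requests

BASE_URL = "https://example.com"  # hypothetical onadata host
FORM_ID = 12345                   # hypothetical form id
TOKEN = "<api-token>"             # hypothetical API token

# Export the form's data as CSV, sorted by id in ascending order.
response = requests.get(
    f"{BASE_URL}/api/v1/data/{FORM_ID}.csv",
    params={"sort": json.dumps({"_id": 1})},
    headers={"Authorization": f"Token {TOKEN}"},
    timeout=60,
)
response.raise_for_status()

with open("export.csv", "wb") as output:
    output.write(response.content)
```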
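
A minimal sketch of the flaky-test fix mentioned above (ensuring the queryset is always ordered). It assumes a configured Django/onadata environment; the form id is a hypothetical value, and `Instance` is the submission model used throughout the tests below.

```python
from onadata.apps.logger.models import Instance

FORM_ID = 12345  # hypothetical form id

# Without an explicit order_by(), the database may return rows in any order,
# which makes positional assertions in tests flaky.
instances = Instance.objects.filter(xform_id=FORM_ID).order_by("pk")

# Positional access is now deterministic: the first item has the lowest pk.
first_submission = instances.first()
```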
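
The pylint suppressions referenced above are module-level directives of roughly this form; `too-many-lines` is the only rule named in the commit message, and any other disabled rules are not listed here.

```python
# pylint: disable=too-many-lines
"""Illustrative module docstring; the directive above silences the named rule
for the whole file."""
```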
kelvin-muchiri authored Aug 23, 2023
1 parent 88f3b00 commit 3fa23c5
Showing 22 changed files with 2,323 additions and 1,819 deletions.
16 changes: 15 additions & 1 deletion docs/data.rst
@@ -372,7 +372,9 @@ Sample response with link header

Sort submitted data of a specific form using existing fields
-------------------------------------------------------------
Provides a sorted list of json submitted data for a specific form by specifying the order in which the query returns matching data. Use the `sort` parameter to filter the list of submissions. The sort parameter has field and value pairs.
Provides a sorted list of json submitted data for a specific form by specifying the order in which the query returns matching data.

No ordering is applied by default -- the data is returned in any arbitrary order. Use the `sort` parameter to filter the list of submissions. The sort parameter has field and value pairs.

::

@@ -390,6 +392,18 @@ Descending sort query using the age field:

{"age":-1}

Query sorted by the id field in ascending order.

::

{"_id":1}

Query sorted by the id field in descending order.

::

{"_id":-1}


Example of Ascending Sort
^^^^^^^^^^^^^^^^^^^^^^^^^
212 changes: 194 additions & 18 deletions onadata/apps/api/tests/viewsets/test_data_viewset.py
@@ -8,6 +8,8 @@
import json
import logging
import os
import csv
from io import StringIO
from builtins import open
from datetime import timedelta
from tempfile import NamedTemporaryFile
@@ -115,6 +117,7 @@ class TestDataViewSet(SerializeMixin, TestBase):
"""
Test /data API endpoint implementation.
"""

lockfile = __file__

def setUp(self):
@@ -269,6 +272,51 @@ def test_numeric_types_are_rendered_as_required(self):
self.assertEqual(response.data[0].get("net_worth"), 100000.00)
self.assertEqual(response.data[0].get("imei"), "351746052009472")

def test_fields_query_params(self):
"""fields query params works"""
view = DataViewSet.as_view({"get": "list"})
fixture_dir = os.path.join(self.this_directory, "fixtures", "csv_export")
form_path = os.path.join(fixture_dir, "tutorial_w_repeats.xlsx")
self._publish_xls_file_and_set_xform(form_path)
submission_path = os.path.join(fixture_dir, "repeats_sub.xml")

for _ in range(102):
self._make_submission(submission_path)

fields_query = {"fields": '["_id", "_status"]'}
request = self.factory.get("/", data=fields_query, **self.extra)
response = view(request, pk=self.xform.id)
self.assertEqual(response.status_code, 200)
self.assertEqual(list(response.data[0].keys()), ["_id", "_status"])

# With pagination
instances = self.xform.instances.all().order_by("pk")
# Page 1
request = self.factory.get(
"/",
data={**fields_query, "page": 1, "page_size": 100},
**self.extra,
)
response = view(request, pk=self.xform.id)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 100)

for index, instance in enumerate(instances[:100]):
self.assertEqual(response.data[index]["_id"], instance.pk)

# Page 2
request = self.factory.get(
"/",
data={**fields_query, "page": 2, "page_size": 100},
**self.extra,
)
response = view(request, pk=self.xform.id)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 2)

for index, instance in enumerate(instances[100:102]):
self.assertEqual(response.data[index]["_id"], instance.pk)

def test_data_jsonp(self):
self._make_submissions()
view = DataViewSet.as_view({"get": "list"})
@@ -1118,11 +1166,11 @@ def test_data_with_query_parameter(self):
second_datetime = start_time + timedelta(days=1, hours=20)

query_str = (
'{"_submission_time": {"$gte": "' +
first_datetime +
'", "$lte": "' +
second_datetime.strftime(MONGO_STRFTIME) +
'"}}'
'{"_submission_time": {"$gte": "'
+ first_datetime
+ '", "$lte": "'
+ second_datetime.strftime(MONGO_STRFTIME)
+ '"}}'
)

request = self.factory.get("/?query=%s" % query_str, **self.extra)
@@ -1679,7 +1727,6 @@ def test_post_save_signal_on_submission_deletion(self, mock, send_message_mock):

@patch("onadata.apps.api.viewsets.data_viewset.send_message")
def test_deletion_of_bulk_submissions(self, send_message_mock):

self._make_submissions()
self.xform.refresh_from_db()
formid = self.xform.pk
@@ -1756,7 +1803,7 @@ def test_submissions_permanent_deletion(self, send_message_mock):
self.xform.refresh_from_db()
self.assertEqual(self.xform.num_of_submissions, 3)
self.assertEqual(self.xform.instances.count(), 3)

# Test project details updated successfully
self.assertEqual(
self.xform.project.date_modified.strftime("%Y-%m-%d %H:%M:%S"),
@@ -1865,7 +1912,9 @@ def test_permanent_instance_delete_inactive_form(self, send_message_mock):

dataid = self.xform.instances.filter(deleted_at=None).order_by("id")[0].pk

request = self.factory.delete("/", **self.extra, data={"permanent_delete": True})
request = self.factory.delete(
"/", **self.extra, data={"permanent_delete": True}
)
response = view(request, pk=formid, dataid=dataid)

self.assertEqual(response.status_code, 204)
@@ -2330,24 +2379,24 @@ def test_instances_with_empty_geopoints(self, mock_signal):
self.assertEqual(response.status_code, 200)
self.assertEqual(
json.dumps(response.data)[:94],
'{"type": "FeatureCollection", "features":' +
' [{"type": "Feature", "geometry": null, "properties":')
self.assertEqual(len(response.data['features']), 1)
feature = dict(response.data['features'][0])
self.assertEqual(feature['type'], 'Feature')
self.assertEqual(feature['geometry'], None)
self.assertTrue(isinstance(feature['properties'], dict))
'{"type": "FeatureCollection", "features":'
+ ' [{"type": "Feature", "geometry": null, "properties":',
)
self.assertEqual(len(response.data["features"]), 1)
feature = dict(response.data["features"][0])
self.assertEqual(feature["type"], "Feature")
self.assertEqual(feature["geometry"], None)
self.assertTrue(isinstance(feature["properties"], dict))
self.assertEqual(self.xform.instances.count(), 2)
self.assertEqual(self.xform.polygon_xpaths(), ['shape'])
self.assertEqual(self.xform.geotrace_xpaths(), ['path'])
self.assertEqual(self.xform.polygon_xpaths(), ["shape"])
self.assertEqual(self.xform.geotrace_xpaths(), ["path"])

# check if instances_with_geopoints is True for the form
self.xform.refresh_from_db()
self.assertTrue(self.xform.instances_with_geopoints)

@patch("onadata.apps.viewer.signals._post_process_submissions")
def test_instances_with_empty_geopoints_no_polygons(self, mock_signal):

# publish sample geo submissions
self._publish_submit_geojson(has_empty_geoms=True, only_geopoints=True)

@@ -3448,3 +3497,130 @@ def test_data_retrieve_instance_osm_format(self):
)
response = view(request, pk=formid)
self.assertEqual(len(response.data), 0)


@override_settings(MEDIA_ROOT=os.path.join(settings.PROJECT_ROOT, "test_data_media/"))
class ExportDataTestCase(SerializeMixin, TestBase):
"""Tests exporting data"""

lockfile = __file__

def setUp(self):
super().setUp()
self._create_user_and_login()
self._publish_transportation_form()
self.factory = RequestFactory()
self.extra = {"HTTP_AUTHORIZATION": "Token %s" % self.user.auth_token}
self.view = DataViewSet.as_view({"get": "list"})

def test_csv_export(self):
"""Data is exported as CSV"""
self._make_submissions()
formid = self.xform.pk
request = self.factory.get("/", data={"format": "csv"}, **self.extra)
response = self.view(request, pk=formid)
self.assertEqual(response.status_code, 200)
csv_file_obj = StringIO(
"".join([c.decode("utf-8") for c in response.streaming_content])
)
csv_reader = csv.reader(csv_file_obj)
headers = next(csv_reader)
expected_headers = [
"transport/available_transportation_types_to_referral_facility/ambulance",
"transport/available_transportation_types_to_referral_facility/bicycle",
"transport/available_transportation_types_to_referral_facility/boat_canoe",
"transport/available_transportation_types_to_referral_facility/bus",
"transport/available_transportation_types_to_referral_facility/donkey_mule_cart",
"transport/available_transportation_types_to_referral_facility/keke_pepe",
"transport/available_transportation_types_to_referral_facility/lorry",
"transport/available_transportation_types_to_referral_facility/motorbike",
"transport/available_transportation_types_to_referral_facility/taxi",
"transport/available_transportation_types_to_referral_facility/other",
"transport/available_transportation_types_to_referral_facility_other",
"transport/loop_over_transport_types_frequency/ambulance/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/bicycle/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/boat_canoe/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/bus/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/donkey_mule_cart/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/keke_pepe/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/lorry/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/motorbike/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/taxi/frequency_to_referral_facility",
"image1",
"meta/instanceID",
"_id",
"_uuid",
"_submission_time",
"_date_modified",
"_tags",
"_notes",
"_version",
"_duration",
"_submitted_by",
"_total_media",
"_media_count",
"_media_all_received",
]
self.assertEqual(headers, expected_headers)
number_records = len(list(csv_reader))
self.assertEqual(number_records, 4)

def test_sort_query_param(self):
"""sort query param works with exports"""

self._make_submissions()
formid = self.xform.pk
# sort csv export data by id in descending order
request = self.factory.get(
"/", data={"format": "csv", "sort": '{"_id": -1}'}, **self.extra
)
response = self.view(request, pk=formid)
self.assertEqual(response.status_code, 200)
csv_file_obj = StringIO(
"".join([c.decode("utf-8") for c in response.streaming_content])
)
csv_reader = csv.reader(csv_file_obj)
instances = Instance.objects.filter(xform_id=formid).order_by("-id")
self.assertEqual(instances.count(), 4)
headers = next(csv_reader)
expected_headers = [
"transport/available_transportation_types_to_referral_facility/ambulance",
"transport/available_transportation_types_to_referral_facility/bicycle",
"transport/available_transportation_types_to_referral_facility/boat_canoe",
"transport/available_transportation_types_to_referral_facility/bus",
"transport/available_transportation_types_to_referral_facility/donkey_mule_cart",
"transport/available_transportation_types_to_referral_facility/keke_pepe",
"transport/available_transportation_types_to_referral_facility/lorry",
"transport/available_transportation_types_to_referral_facility/motorbike",
"transport/available_transportation_types_to_referral_facility/taxi",
"transport/available_transportation_types_to_referral_facility/other",
"transport/available_transportation_types_to_referral_facility_other",
"transport/loop_over_transport_types_frequency/ambulance/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/bicycle/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/boat_canoe/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/bus/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/donkey_mule_cart/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/keke_pepe/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/lorry/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/motorbike/frequency_to_referral_facility",
"transport/loop_over_transport_types_frequency/taxi/frequency_to_referral_facility",
"image1",
"meta/instanceID",
"_id",
"_uuid",
"_submission_time",
"_date_modified",
"_tags",
"_notes",
"_version",
"_duration",
"_submitted_by",
"_total_media",
"_media_count",
"_media_all_received",
]
self.assertEqual(headers, expected_headers)
# csv records should be ordered by id in descending order
for instance in instances:
row = next(csv_reader)
self.assertEqual(str(instance.id), row[22])
36 changes: 0 additions & 36 deletions onadata/apps/api/tests/viewsets/test_note_viewset.py
@@ -1,13 +1,7 @@
import os
from datetime import datetime

from django.conf import settings
from django.utils.timezone import make_aware
from django.test import RequestFactory
from guardian.shortcuts import assign_perm

from onadata.apps.api.viewsets.note_viewset import NoteViewSet
from onadata.apps.api.viewsets.xform_viewset import XFormViewSet
from onadata.apps.logger.models import Note
from onadata.apps.main.tests.test_base import TestBase
from onadata.libs.serializers.note_serializer import NoteSerializer
@@ -186,36 +180,6 @@ def test_only_add_question_notes_to_existing_fields(self):
instance = self.xform.instances.all()[0]
self.assertEqual(len(instance.json["_notes"]), 0)

def test_csv_export_form_w_notes(self):
"""
Test CSV exports include notes for submissions that have notes.
"""
self._add_notes_to_data_point()
self._add_notes_to_data_point()

time = make_aware(datetime(2016, 7, 1))
for instance in self.xform.instances.all():
instance.date_created = time
instance.save()
instance.parsed_instance.save()

view = XFormViewSet.as_view({"get": "retrieve"})

request = self.factory.get("/", **self.extra)
response = view(request, pk=self.xform.pk, format="csv")
self.assertTrue(response.status_code, 200)

test_file_path = os.path.join(
settings.PROJECT_ROOT,
"apps",
"viewer",
"tests",
"fixtures",
"transportation_w_notes.csv",
)

self._test_csv_response(response, test_file_path)

def test_attribute_error_bug(self):
"""NoteSerializer: Should not raise AttributeError exeption"""
note = Note(note="Hello", instance=self._first_xform_instance)
2 changes: 1 addition & 1 deletion onadata/apps/api/tests/viewsets/test_tableau_viewset.py
@@ -396,7 +396,7 @@ def test_gt_id_query_param(self):
self.view = TableauViewSet.as_view({"get": "data"})
_open_data = get_or_create_opendata(self.xform)
uuid = _open_data[0].uuid
request = self.factory.get("/", data={"gt_id": 10000}, **self.extra)
request = self.factory.get("/", data={"gt_id": 100000}, **self.extra)
response = self.view(request, uuid=uuid)
self.assertEqual(response.status_code, 200)
row_data = streaming_data(response)
8 changes: 4 additions & 4 deletions onadata/apps/api/tests/viewsets/test_xform_viewset.py
@@ -3858,8 +3858,8 @@ def test_csv_export_with_win_excel_utf8(self):
# check that response has property 'has_hxl_support' which is true
self.assertEqual(response.status_code, 200)
self.assertTrue(response.data.get("has_hxl_support"))

data = {"win_excel_utf8": True}
# sort csv data in ascending order
data = {"win_excel_utf8": True, "sort": '{"_id": 1}'}
request = self.factory.get("/", data=data, **self.extra)
response = view(request, pk=self.xform.pk, format="csv")
self.assertEqual(response.status_code, 200)
@@ -3887,8 +3887,8 @@ def test_csv_export_with_win_excel_utf8(self):
filename = filename_from_disposition(content_disposition)
basename, ext = os.path.splitext(filename)
self.assertEqual(ext, ".csv")

data = {"win_excel_utf8": False}
# sort csv data in ascending order
data = {"win_excel_utf8": False, "sort": '{"_id": 1}'}
request = self.factory.get("/", data=data, **self.extra)
response = view(request, pk=self.xform.pk, format="csv")
self.assertEqual(response.status_code, 200)