From 83c371284fbf3f4bb6409098d06caa0dc52a38cc Mon Sep 17 00:00:00 2001 From: Sharon Lifshitz Date: Sun, 29 Sep 2019 11:45:09 +0300 Subject: [PATCH] [DOC-REVIEW] Frames README (#2) (#292) * [DOC-REVIEW] Frames README (#2) * [DOC-REVIEW] Frames README (#3) (PR 292 #2) * [DOC-REVIEW] Frames README: Fix previous commit (#3) (PR 292 #2b) * [DOC-REVIEW] Frames README: Add client constructor Return Value (#3) (PR 292 #2c) * [DOC-REVIEW] Frames README - create: Edit address param doc & add internal info for schema param (#3) (PR 292 #2d) * [DOC-REVIEW] Frames client help-text review edit (PR 292 #3) * [DOC-REVIEW] Frames README & help text: Edit the execute method description (PR 292 #4) * [DOC-REVIEW] Frames client help-text: fix long-lines errors (PR 292 #4a) --- README.md | 664 +++++++++++++++++++++++------ clients/py/v3io_frames/__init__.py | 31 +- clients/py/v3io_frames/client.py | 97 +++-- 3 files changed, 617 insertions(+), 175 deletions(-) diff --git a/README.md b/README.md index c1be217f..9cdb79e4 100644 --- a/README.md +++ b/README.md @@ -4,221 +4,627 @@ [![GoDoc](https://godoc.org/github.com/v3io/frames?status.svg)](https://godoc.org/github.com/v3io/frames) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -V3IO Frames is a high-speed server and client library for accessing time-series (TSDB), NoSQL, and streaming data in the [Iguazio Data Science Platform](https://www.iguazio.com). +V3IO Frames (**"Frames"**) is a multi-model open-source data-access library, developed by Iguazio, which provides a unified high-performance DataFrame API for working with data in the data store of the [Iguazio Data Science Platform](https://www.iguazio.com) (**"the platform"**). -## Documentation +#### In This Document -Frames currently supports 3 backends and basic CRUD functionality for each. 
+- [Client Python API Reference](#client-python-api-reference) +- [Contributing](#contributing) +- [LICENSE](#license) -Supported Backends: -1. TSDB -2. KV -3. Stream -4. CSV - for testing purposes + +## Client Python API Reference +- [Overview](#overview) +- [User Authentication](#user-authentication) +- [Client Constructor](#client-constructor) +- [Common Client Method Parameters](#client-common-method-params) +- [create Method](#method-create) +- [write Method](#method-write) +- [read Method](#method-read) +- [delete Method](#method-delete) +- [execute Method](#method-execute) -All of frames operations are executed via the `client` object. To create a client object simply provide the Iguazio web-api endpoint and optional credentials. + +### Overview + +To use Frames, you first need to import the **v3io_frames** Python library. +For example: ```python import v3io_frames as v3f -client = v3f.Client('framesd:8081', user='user1', password='pass') ``` -Note: When running from within the managed jupyter notebook on the iguazio platform there is no need to add credentials as this is handled by the platform. -Next, for every operation we need to provide a `backend`, and a `table` parameters and optionally other function specific arguments. +Then, you need to create and initialize an instance of the `Client` class; see [Client Constructor](#client-constructor). +You can then use the client methods to perform different data operations on the supported backend types: + + +#### Client Methods + +The `Client` class features the following methods for supporting basic data operations: + +- [`create`](#method-create) — creates a new TSDB table or a stream ("backend data"). +- [`delete`](#method-delete) — deletes a table or stream or specific table items +- [`read`](#method-read) — reads data from a table or stream into pandas DataFrames. +- [`write`](#method-write) — writes data from pandas DataFrames to a table or stream. 
+- [`execute`](#method-execute) — executes a backend-specific command on a table or stream. + Each backend may support multiple commands. + + +#### Backend Types + +All Frames client methods receive a [`backend`](#client-method-param-backend) parameter for setting the Frames backend type. +Frames supports the following backend types: + +- `kv` — a platform NoSQL (key/value) table. +- `stream` — a platform data stream. +- `tsdb` — a time-series database (TSDB). +- `csv` — a comma-separated-value (CSV) file. + This backend type is used only for testing purposes. + +> **Note:** Some method parameters are common to all backend types and some are backend-specific, as detailed in this reference. + + +### User Authentication + +When creating a Frames client, you must provide valid platform credentials for accessing the backend data, which Frames will use to identify the identity of the user. +This can be done by using any of the following alternative methods (documented in order of precedence): + +- Provide the authentication credentials in the [`Client` constructor parameters](#client-constructor-parameters) by using either of the following methods: + + - Set the [`token`](#client-param-token) constructor parameter to a valid platform access key with the required data-access permissions. + - Set the [`user`](#client-param-user) and [`password`](#client-param-password) constructor parameters to the username and password of a platform user with the required data-access permissions. +
+ + > **Note:** You can't use both methods concurrently: setting the `token` parameter together with the `user` and `password` parameters in the same constructor call will produce an error. + +- Set the authentication credentials in environment variables, by using either of the following methods: + + - Set the `V3IO_ACCESS_KEY` environment variable to a valid platform access key with the required data-access permissions. + - Set the `V3IO_USERNAME` and `V3IO_PASSWORD` environment variables to the username and password of a platform user with the required data-access permissions. +
+ + > **Note:** + > - When the client constructor is called with [authentication parameters](#user-auth-client-const-params), the authentication-credentials environment variables (if defined) are ignored. + > - When `V3IO_ACCESS_KEY` is defined, `V3IO_USERNAME` and `V3IO_PASSWORD` are ignored. + > - The platform's Jupyter Notebook service automatically defines the `V3IO_ACCESS_KEY` environment variable and initializes it to a valid access key for the running user of the service. + + +### Client Constructor + +All Frames operations are executed via an object of the `Client` class. + +- [Syntax](#client-constructor-syntax) +- [Parameters and Data Members](#client-constructor-parameters) +- [Return Value](#client-constructor-return-value) +- [Example](#client-constructor-example) + + +#### Syntax -### Create -Creates a new table for the desired backend. Not all backends require a table to be created prior to ingestion. For example KV table will be created while ingesting new data, on the other hand since TSDB tables have mandatory fields we need to create a table before ingesting new data. ```python -client.create(backend=, table=, attrs=) +Client(address='', container='', user='', password='', token='') ``` -#### backend specific parameters -##### TSDB -* rate -* aggregates (optional) -* aggregation-granularity (optional) + +#### Parameters and Data Members + +- **address** — The address of the Frames service (`framesd`). +
+ When running locally on the platform (for example, from a Jupyter Notebook service), set this parameter to `framesd:8081` to use gRPC (recommended) or to `framesd:8080` to use HTTP. +
+ When connecting to the platform remotely, set this parameter to the API address of the Frames platform service of the parent tenant. + You can copy this address from the **API** column of the V3IO Frames service on the **Services** platform dashboard page. + + + - **Type:** `str` + - **Requirement:** Required + +- **container** — The name of the platform data container that contains the backend data. + For example, `"bigdata"` or `"users"`. + + - **Type:** `str` + - **Requirement:** Required + +- **user** — The username of a platform user with permissions to access the backend data. + + - **Type:** `str` + - **Requirement:** Required when neither the [`token`](#client-param-token) parameter nor the authentication environment variables are set. + See [User Authentication](#user-authentication). +
+ When the `user` parameter is set, the [`password`](#client-param-password) parameter must also be set to a matching user password. + +- **password** — A platform password for the user configured in the [`user`](#client-param-user) parameter. + + - **Type:** `str` + - **Requirement:** Required when the [`user`](#client-param-user) parameter is set. + See [User Authentication](#user-authentication). + +- **token** — A valid platform access key that allows access to the backend data. + To get this access key, select the user profile icon on any platform dashboard page, select **Access Tokens**, and copy an existing access key or create a new key. + + - **Type:** `str` + - **Requirement:** Required when neither the [`user`](#client-param-user) or [`password`](#client-param-password) parameters or the authentication environment variables are set. + See [User Authentication](#user-authentication). + + +#### Return Value + +Returns a new Frames `Client` data object. + + +#### Example + +The following example, for local platform execution, creates a Frames client for accessing data in the "users" container by using the authentication credentials of the user "iguazio": + +```python +import v3io_frames as v3f +client = v3f.Client("framesd:8081", user="iguazio", password="mypass", container="users") +``` + + +### Common Client Method Parameters + +All client methods receive the following common parameters: + +- **backend** — The backend data type for the operation. + See the backend-types descriptions in the [overview](#backend-types). + + - **Type:** `str` + - **Valid Values:** `"csv"` | `"kv"` | `"stream"` | `"tsdb"` + - **Requirement:** Required + +- **table** — The relative path to the backend data — A directory in the target platform data container (as configured for the client object) that represents a TSDB or NoSQL table or a data stream. + For example, `"mytable"` or `"examples/tsdb/my_metrics"`. 
+ + - **Type:** `str` + - **Requirement:** Required unless otherwise specified in the method-specific documentation + +Additional method-specific parameters are described for each method. + + +### create Method + +Creates a new TSDB table or a stream in a platform data container, according to the specified backend type. + +The `create` method is supported by the `tsdb` and `stream` backends, but not by the `kv` backend, because NoSQL tables in the platform don't need to be created prior to ingestion; when ingesting data into a table that doesn't exist, the table is automatically created. + +- [Syntax](#method-create-syntax) +- [Common parameters](#method-create-common-params) +- [`tsdb` backend `create` parameters](#method-create-params-tsdb) +- [`stream` backend `create` parameters](#method-create-params-stream) + + +#### Syntax + +```python +create(backend, table, attrs=None) +``` + + + + +#### Common create Parameters + +All Frames backends that support the `create` method support the following common parameters: + +- **attrs** — A dictionary of `: ` pairs for passing additional backend-specific parameters (arguments). + + - **Type:** dict + - **Requirement:** Optional + - **Default Value:** `None` + + +#### tsdb Backend create Parameters + +The following `tsdb` backend parameters are passed via the [`attrs`](#method-create-param-attrs) parameter of the `create` method: + +- **rate** — The ingestion rate TSDB's metric-samples, as `"[0-9]+/[smh]"` (where `s` = seconds, `m` = minutes, and `h` = hours); for example, `"1/s"` (one sample per minute). + The rate should be calculated according to the slowest expected ingestion rate. + + - **Type:** `str` + - **Requirement:** Required + +- **aggregates** — Default aggregates to calculate in real time during the samples ingestion, as a comma-separated list of supported aggregation functions. 
+ + - **Type:** `str` + - **Requirement:** Optional + +- **aggregation-granularity** — Aggregation granularity; i.e., a time interval for applying the aggregation functions, if configured in the [`aggregates`](#method-create-tsdb-param-aggregates) parameter. + + - **Type:** `str` + - **Requirement:** Optional + +For detailed information about these parameters, refer to the [V3IO TSDB documentation](https://github.com/v3io/v3io-tsdb#v3io-tsdb). -For detailed info on these parameters please visit [TSDB](https://github.com/v3io/v3io-tsdb#v3io-tsdb) docs. Example: ```python -client.create('tsdb', '/mytable', attrs={'rate': '1/m'}) +client.create("tsdb", "/mytable", attrs={"rate": "1/m"}) ``` -##### Stream -* shards=1 (optional) -* retention_hours=24 (optional) + +#### stream Backend create Parameters + +The following `stream` backend parameters are passed via the [`attrs`](#method-create-param-attrs) parameter of the `create` method: + +- **shards** (Optional) (default: `1`) — `int` — The number of stream shards to create. +- **retention_hours** (Optional) (default: `24`) — `int` — The stream's retention period, in hours. + +For detailed information about these parameters, refer to the [platform streams documentation](https://www.iguazio.com/docs/concepts/latest-release/streams). -For detailed info on these parameters please visit [Stream](https://www.iguazio.com/docs/concepts/latest-release/streams) docs. Example: ```python -client.create('stream', '/mystream', attrs={'shards': '6'}) +client.create("stream", "/mystream", attrs={"shards": 6}) ``` -### Write -Writes a Dataframe into one of the supported backends. -Common write parameters: -* dfs - list of Dataframes to write -* index_cols=None (optional) - specify specific index columns, by default Dataframe's index columns will be used. 
-* labels=None (optional) -* max_in_message=0 (optional) -* partition_keys=None (Not yet supported) + +### write Method + +Writes data from a DataFrame to a table or stream in a platform data container, according to the specified backend type. + +- [Syntax](#method-write-syntax) +- [Common parameters](#method-write-common-params) +- [`tsdb` backend `write` parameters](#method-write-params-tsdb) +- [`kv` backend `write` parameters](#method-write-params-kv) + + +#### Syntax + + + +```python +write(backend, table, dfs, condition='', labels=None, max_in_message=0, + index_cols=None, partition_keys=None) +``` + +- When the value of the [`iterator`](#method-read-param-iterator) parameter is `False` (default) — returns a single DataFrame. +- When the value of the `iterator` parameter is `True` — returns a + DataFrames iterator. + The returned DataFrames include a `"labels"` DataFrame attribute with backend-specific data, if applicable; for example, for the `stream` backend, this attribute holds the sequence number of the last stream record that was read. + + +#### Common write Parameters + +All Frames backends that support the `write` method support the following common parameters: + +- **dfs** (Required) — A single DataFrame, a list of DataFrames, or a DataFrames iterator — One or more DataFrames containing the data to write. +- **index_cols** (Optional) (default: `None`) — `[]str` — A list of column (attribute) names to be used as index columns for the write operation, regardless of any index-column definitions in the DataFrame. + By default, the DataFrame's index columns are used. +
+ > **Note:** The significance and supported number of index columns is backend specific. + > For example, the `kv` backend supports only a single index column for the primary-key item attribute, while the `tsdb` backend supports additional index columns for metric labels. +- **labels** (Optional) (default: `None`) — This parameter is currently defined for all backends but is used only for the TSDB backend, therefore it's documented as part of the `write` method's [`tsdb` backend parameters](#method-write-params-tsdb). +- **max_in_message** (Optional) (default: `0`) +- **partition_keys** (Optional) (default: `None`) — `[]str` — [**Not supported in this version**] Example: ```python -data = [['tom', 10], ['nick', 15], ['juli', 14]] -df = pd.DataFrame(data, columns = ['name', 'age']) -df.set_index('name') -client.write(backend='kv', table='mytable', dfs=df) +data = [["tom", 10], ["nick", 15], ["juli", 14]] +df = pd.DataFrame(data, columns = ["name", "age"]) +df.set_index("name") +client.write(backend="kv", table="mytable", dfs=df) ``` -#### backend specific parameters -##### KV + +#### tsdb Backend write Parameters + +- **labels** (Optional) (default: `None`) — `dict` — A dictionary of `
`) or by setting the `table` parameter of the `read` method to the table path. + > When the `query` string specifies the target table, the value of the `table` parameter (if set) is ignored. +- **group_by** (Optional) — `str` — A group-by query string. +
+ This parameter can't be used concurrently with the `query` parameter. +- **multi_index** (Optional) — `bool` — `True` to receive the read results as multi-index DataFrames where the labels are used as index columns in addition to the metric sample-time primary-key attribute; `False` (default) only the timestamp will function as the index. + + +For detailed information about these parameters, refer to the [V3IO TSDB documentation](https://github.com/v3io/v3io-tsdb#v3io-tsdb). + Example: ```python -df = client.read(backend='tsdb', query="select avg(cpu) as cpu, avg(diskio), avg(network)from mytable", start='now-1d', end='now', step='2h') +df = client.read(backend="tsdb", query="select avg(cpu) as cpu, avg(diskio), avg(network)from mytable", start="now-1d", end="now", step="2h") ``` -##### KV -* reset_index: bool - Reset the index. When set to `false` (default), the dataframe will have the key column of the v3io kv as the index column. -When set to `true`, the index will be reset to a range index. -* max_in_message: int - Maximal number of rows per message -* sharding_keys: []string (Experimental)- list of specific sharding keys to query. For range scan formatted tables only. -* segments: []int64 (Not yet supported) -* total_segments: int64 (Not yet supported) -* sort_key_range_start: string (Not yet supported) -* sort_key_range_end: string (Not yet supported) + +#### kv Backend read Parameters + +- **reset_index** — `bool` — Reset the index. When set to `false` (default), the DataFrame will have the key column of the v3io kv as the index column. + When set to `true`, the index will be reset to a range index. +- **max_in_message** — `int` — The maximum number of rows per message. +- **sharding_keys** — `[]string` (**Experimental**) — A list of specific sharding keys to query, for range-scan formatted tables only. 
+- **segments** — `[]int64` [**Not supported in this version**] +- **total_segments** — `int64` [**Not supported in this version**] +- **sort_key_range_start** — `str` [**Not supported in this version**] +- **sort_key_range_end** — `str` [**Not supported in this version**] -For detailed info on these parameters please visit KV docs. +For detailed information about these parameters, refer to the platform's NoSQL documentation. Example: ```python -df = client.read(backend='kv', table='mytable', filter='col1>666') +df = client.read(backend="kv", table="mytable", filter="col1>666") ``` -##### Stream -* seek: string - excepted values: time | seq/sequence | latest | earliest. -if `seq` seek type is requested, need to provide the desired sequence id via `sequence` parameter. -if `time` seek type is requested, need to provide the desired start time via `start` parameter. -* shard_id: string -* sequence: int64 (optional) + +#### stream Backend read Parameters -For detailed info on these parameters please visit [Stream](https://www.iguazio.com/docs/concepts/latest-release/streams) docs. +- **seek** — `str` — Valid values: `"time" | "seq"/"sequence" | "latest" | "earliest"`. +
+ If the `"seq"|"sequence"` seek type is set, you need to provide the desired record sequence ID via the [`sequence`](#method-read-stream-param-sequence) parameter. +
+ If the `time` seek type is set, you need to provide the desired start time via the `start` parameter. +- **shard_id** — `str` +- **sequence** — `int64` (Optional) + +For detailed information about these parameters, refer to the [platform streams documentation](https://www.iguazio.com/docs/concepts/latest-release/streams). Example: ```python -df = client.read(backend='stream', table='mytable', seek='latest', shard_id='5') +df = client.read(backend="stream", table="mytable", seek="latest", shard_id="5") ``` -### Delete -Deletes a table of a specific backend. + +#### Return Value + +- When the value of the [`iterator`](#method-read-param-iterator) parameter is `False` (default) — returns a single DataFrame. +- When the value of the `iterator` parameter is `True` — returns a + DataFrames iterator. + +> **Note:** The returned DataFrames include a `labels` DataFrame attribute with backend-specific data, if applicable. +> For example, for the `stream` backend, this attribute holds the sequence number of the last stream record that was read. + + + +### delete Method + +Deletes a table or stream or specific table items from a platform data container, according to the specified backend type. + +- [Syntax](#method-delete-syntax) +- [`tsdb` backend `delete` parameters](#method-delete-params-tsdb) +- [`kv` backend `delete` parameters](#method-delete-params-kv) + + +#### Syntax -Example: ```python -df = client.delete(backend='', table='mytable') +delete(backend, table, filter='', start='', end='') ``` -#### backend specific parameters -##### TSDB -* start: string - delete since start -* end: string - delete since start + +#### tsdb Backend delete Parameters + +- **start** — `str` — Start (minimum) time for the delete operation, as a string containing an RFC 3339 time, a Unix timestamp in milliseconds, a relative time of the format `"now"` or `"now-[0-9]+[mhd]"` (where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the earliest time. 
+ For example: `"2016-01-02T15:34:26Z"`; `"1451748866000"`; `"now-90m"`; `"0"`. +
+ The default start time is `<end time> - 1h`. +- **end** — `str` — End (maximum) time for the delete operation, as a string containing an RFC 3339 time, a Unix timestamp in milliseconds, a relative time of the format `"now"` or `"now-[0-9]+[mhd]"` (where `m` = minutes, `h` = hours, and `'d'` = days), or 0 for the earliest time. + For example: `"2018-09-26T14:10:20Z"`; `"1537971006000"`; `"now-3h"`; `"now-7d"`. +
+ The default end time is `"now"`. + +> **Note:** When neither the `start` or `end` parameters are set, the entire TSDB table is deleted. + +For detailed information about these parameters, refer to the [V3IO TSDB](https://github.com/v3io/v3io-tsdb#v3io-tsdb) documentation. -Note: if both `start` and `end` are not specified **all** the TSDB table will be deleted. -For detailed info on these parameters please visit [TSDB](https://github.com/v3io/v3io-tsdb#v3io-tsdb) docs. Example: ```python -df = client.delete(backend='tsdb', table='mytable', start='now-1d', end='now-5h') +df = client.delete(backend="tsdb", table="mytable", start="now-1d", end="now-5h") ``` -##### KV -* filter: string - Filter for selective delete + + +#### kv Backend delete Parameters + +- **filter** — `str` — A platform filter expression that identifies specific items to delete. + For detailed information about platform filter expressions, see the [platform documentation](https://www.iguazio.com/docs/reference/latest-release/expressions/condition-expression/#filter-expression). + +> **Note:** When the `filter` parameter isn't set, the entire table is deleted. Example: ```python -df = client.delete(backend='kv', table='mytable', filter='age>40') +df = client.delete(backend="kv", table="mytable", filter="age > 40") ``` -### Execute -Provides additional functions that are not covered in the basic CRUD functionality. + +### execute Method + +Extends the basic CRUD functionality of the other client methods via backend-specific commands. -##### TSDB -Currently no `execute` commands are available for the TSDB backend. 
+- [Syntax](#method-execute-syntax) +- [Common parameters](#method-execute-common-params) +- [tsdb backend commands](#method-execute-tsdb-cmds) +- [kv backend commands](#method-execute-kv-cmds) +- [stream backend commands](#method-execute-stream-cmds) + + +#### Syntax + +```python +execute(backend, table, command='', args=None) +``` + + +#### Common execute Parameters + +All Frames backends that support the `execute` method support the following common parameters: + +- **args** — A dictionary of `: ` pairs for passing command-specific parameters (arguments). + + - **Type:** dict + - **Requirement:** Optional + - **Default Value:** `None` + + +### tsdb Backend execute Commands + +Currently, no `execute` commands are available for the `tsdb` backend. + + +### kv Backend execute Commands + +- **infer | inferschema** — Infers the data schema of a given NoSQL table and creates a schema file for the table. + + Example: + ```python + client.execute(backend="kv", table="mytable", command="infer") + ```` -##### KV -* infer, inferschema - inferring and creating a schema file for a given kv table. - Example: `client.execute(backend='kv', table='mytable', command='infer')` -##### Stream -* put - putting a new object to a stream. -Example: `client.execute(backend='stream', table='mystream', command='put', args={'data': 'this a record', 'clientinfo': 'some_info', 'partition': 'partition_key'})` + +### stream Backend execute Commands + +- **put** — Adds records to a stream. 
+ + Example: + ```python + client.execute(backend="stream", table="mystream", command="put", args={"data": "this a record", "clientinfo": "some_info", "partition": "partition_key"}) + ``` + ## Contributing +To contribute to V3IO Frames, you need to be aware of the following: + +- [Components](#components) +- [Development](#development) + - [Adding and Changing Dependencies](#adding-and-changing-dependencies) + - [Travis CI](#travis-ci) +- [Docker Image](#docker-image) + - [Building the Image](#building-the-image) + - [Running the Image](#running-the-image) + + ### Components +The following components are required for building Frames code: + - Go server with support for both the gRPC and HTTP protocols - Go client - Python client + ### Development The core is written in [Go](https://golang.org/). The development is done on the `development` branch and then released to the `master` branch. +Before submitting changes, test the code: + - To execute the Go tests, run `make test`. - To execute the Python tests, run `make test-python`. -#### Adding/Changing Dependencies + +#### Adding and Changing Dependencies - If you add Go dependencies, run `make update-go-deps`. - If you add Python dependencies, update **clients/py/Pipfile** and run `make update-py-deps`. + #### Travis CI Integration tests are run on [Travis CI](https://travis-ci.org/). @@ -227,13 +633,13 @@ See **.travis.yml** for details. The following environment variables are defined in the [Travis settings](https://travis-ci.org/v3io/frames/settings): - Docker Container Registry ([Quay.io](https://quay.io/)) - - `DOCKER_PASSWORD` — Password for pushing images to Quay.io. - - `DOCKER_USERNAME` — Username for pushing images to Quay.io. + - `DOCKER_PASSWORD` — a password for pushing images to Quay.io. + - `DOCKER_USERNAME` — a username for pushing images to Quay.io. - Python Package Index ([PyPI](https://pypi.org/)) - - `V3IO_PYPI_PASSWORD` — Password for pushing a new release to PyPi. 
- - `V3IO_PYPI_USER` — Username for pushing a new release to PyPi. + - `V3IO_PYPI_PASSWORD` — a password for pushing a new release to PyPi. + - `V3IO_PYPI_USER` — a username for pushing a new release to PyPi. - Iguazio Data Science Platform - - `V3IO_SESSION` — A JSON encoded map with session information for running tests. + - `V3IO_SESSION` — a JSON encoded map with session information for running tests. For example: ``` @@ -241,8 +647,10 @@ The following environment variables are defined in the [Travis settings](https:/ ``` > **Note:** Make sure to embed the JSON object within single quotes (`'{...}'`). + ### Docker Image + #### Building the Image Use the following command to build the Docker image: @@ -251,6 +659,7 @@ Use the following command to build the Docker image: make build-docker ``` + #### Running the Image Use the following command to run the Docker image: @@ -261,6 +670,7 @@ docker run \ quay.io/v3io/frames:unstable ``` + ## LICENSE [Apache 2](LICENSE) diff --git a/clients/py/v3io_frames/__init__.py b/clients/py/v3io_frames/__init__.py index c394d710..e1ed95c8 100644 --- a/clients/py/v3io_frames/__init__.py +++ b/clients/py/v3io_frames/__init__.py @@ -35,31 +35,40 @@ def Client(address='', data_url='', container='', path='', user='', password='', token='', session_id='', frame_factory=pd.DataFrame, concat=pd.concat): - """Return a new client. + """Creates a new Frames client object Parameters ---------- address : str - framesd backend address. Use grpc:// or http:// prefix to specify - protocol (default is gRPC) + Address of the Frames service (framesd). Use the grpc:// prefix for + gRPC (default; recommended) or the http:// prefix for HTTP. + Use `framesd:8081` (gRPC; recommended) or `framesd:8080` for local + execution on an Iguazio Data Science Platform ("the platform"). 
data_url : str - Backend URL (session info) + Base URL for accessing the backend data container : str Container name (session info) path : str - Path in container (session info) + DEPRECATED user : str - Login user (session info) + The username of a platform user with permissions to access the backend + data; can't be used with `token` password : str - Login password (session info) + A platform password for the user configured in the `user` parameter; + required when `user` is set; can't be used with `token` token : str - Login token (session info) + A valid platform access key that allows access to the backend data; + can't be used with `user` and `password` session_id : str - Session ID (session info) + Session ID frame_factory : class - DataFrame factory + DataFrame factory; currently, pandas DataFrame (default) concat : function - Function to concat DataFrames + Function for concatenating DataFrames; default: pandas concat + + Return Value + ---------- + A new `Client` object """ protocol = urlparse(address).scheme or 'grpc' if protocol not in _known_protocols: diff --git a/clients/py/v3io_frames/client.py b/clients/py/v3io_frames/client.py index c73f58d1..37ae8d90 100644 --- a/clients/py/v3io_frames/client.py +++ b/clients/py/v3io_frames/client.py @@ -25,18 +25,22 @@ class ClientBase: def __init__(self, address, session, frame_factory=pd.DataFrame, concat=pd.concat): - """Create new client + """Creates a new Frames client object Parameters ---------- address : str - framesd server address + Address of the Frames service (framesd) session : Session Session object frame_factory : class - DataFrame factory (currencly pandas and cudf supported) + DataFrame factory; currently, pandas and cuDF are supported concat : function - Function to concat DataFrames + Function for concatenating DataFrames; default: pandas concat + + Return Value + ---------- + A new `Client` object """ address = address or environ.get('V3IO_FRAMESD') if not address: @@ -49,40 +53,46 @@ def 
__init__(self, address, session, frame_factory=pd.DataFrame, def read(self, backend='', table='', query='', columns=None, filter='', group_by='', limit=0, data_format='', row_layout=False, max_in_message=0, marker='', iterator=False, **kw): - """Run a query + """Reads data from a table or stream (run a data query) Common Parameters ---------- backend : str Backend name ('kv', 'tsdb', 'stream') table : str - Table to query (can't be used with query) + Table to query; ignored when `query` references a specific table query : str - Query in SQL format + Query string, in SQL format columns : []str - List of columns to pass (can't be used with query) + List of item attributes (columns) to return; + can't be used with `query` filter : str - Query filter (can't be used with query) + Query filter; can't be used with `query` group_by : str - Query group by (can't be used with query) + A group-by query string; can't be used with `query` limit: int - Maximal number of rows to return + Maximum number of rows to return data_format : str Data format row_layout : bool - Weather to use row layout (vs the default column layout) + True to use a row layout; False (default) to use a column layout + [Not supported in this version] max_in_message : int - Maximal number of rows per message + Maximum number of rows per message marker : str - Query marker (can't be used with query) + Query marker; can't be used with the `query` parameter iterator : bool - Return iterator of DataFrames or (if False) just one DataFrame + True - return a DataFrames iterator; + False (default) - return a single DataFrame **kw Extra parameter for specific backends - Returns: - A pandas DataFrame iterator. Each DataFrame will have "labels" - attribute. If `iterator` is False will return a single DataFrame. + Return Value + ---------- + - When `iterator` is False (default) - returns a single DataFrame. + - When `iterator` is True - returns a DataFrames iterator. 
+ The returned DataFrames include a "labels" DataFrame attribute with + backend-specific data, if applicable. """ if not backend: raise ReadError('no backend') @@ -101,7 +111,7 @@ def read(self, backend='', table='', query='', columns=None, filter='', def write(self, backend, table, dfs, expression='', condition='', labels=None, max_in_message=0, index_cols=None, partition_keys=None): - """Write to table + """Writes data to a table or stream Parameters ---------- @@ -109,21 +119,26 @@ def write(self, backend, table, dfs, expression='', condition='', Backend name table : str Table to write to - dfs : iterable of DataFrame or a single data frame - Frames to write + dfs : a single DataFrame, a DataFrames list, or a DataFrames iterator + DataFrames to write expression : str - Write expression + A platform update expression that determines the update logic for + all items in the DataFrame [Not supported in this version] condition : str - Write condition + A platform condition expression that defines a condition for + performing the write operation labels : dict - Set of lables + Dictionary of labels; currently, used only with the "tsdb" backend max_in_message : int - Maximal number of rows to send per message - index_cols : list of str - Columns to use as indices - partition_keys : list of str - Partition keys - Returns: + Maximum number of rows to send per message + index_cols : []str + List of column names to be used as the index columns for the write + operation; by default, the DataFrame's index columns are used + partition_keys : []str + Partition keys [Not supported in this version] + + Return Value + ---------- Write result """ self._validate_request(backend, table, WriteError) @@ -138,7 +153,7 @@ def write(self, backend, table, dfs, expression='', condition='', return self._write(request, dfs, labels, index_cols) def create(self, backend, table, attrs=None, schema=None, if_exists=FAIL): - """Create a table + """Creates a new TSDB table or a stream Parameters 
---------- @@ -147,7 +162,7 @@ def create(self, backend, table, attrs=None, schema=None, if_exists=FAIL): table : str Table to create attrs : dict - Table attributes + A dictionary of backend-specific parameters (arguments) schema: Schema or None Table schema if_exists : int @@ -163,7 +178,7 @@ def create(self, backend, table, attrs=None, schema=None, if_exists=FAIL): def delete(self, backend, table, filter='', start='', end='', if_missing=FAIL): - """Delete a table + """Deletes a table or stream or specific table items Parameters ---------- @@ -174,9 +189,17 @@ def delete(self, backend, table, filter='', start='', end='', filter : str Filter for selective delete start : string - Delete since start (TSDB/Stream) + (`tsdb` backend only) Start (minimum) metric-sample time for the + delete operation, as a string containing an RFC 3339 time, a Unix + timestamp in milliseconds, a relative time (`"now"` or + `"now-[0-9]+[mhd]"`, where `m` = minutes, `h` = hours, and `d` = + days), or 0 for the earliest time; the default is 1h before the `end` time end : string - Delete up to end (TSDB/Stream) + (`tsdb` backend only) End (maximum) metric-sample time for the + delete operation, as a string containing an RFC 3339 time, a Unix + timestamp in milliseconds, a relative time (`"now"` or + `"now-[0-9]+[mhd]"`, where `m` = minutes, `h` = hours, and `d` = + days), or 0 for the earliest time; the default is "now" if_missing : int One of IGNORE or FAIL @@ -189,7 +212,7 @@ def delete(self, backend, table, filter='', start='', end='', return self._delete(backend, table, filter, start, end, if_missing) def execute(self, backend, table, command='', args=None, expression=''): - """Execute a command + """Executes a backend-specific command on a table or stream Parameters ---------- @@ -200,7 +223,7 @@ def execute(self, backend, table, command='', args=None, expression=''): command : str Command to execute args : dict - Command arguments + A dictionary of command-specific parameters (arguments) expression
: str Command expression