From a2e4cd141e7af19bc65a3b6bdfe6b794ab6cbdb7 Mon Sep 17 00:00:00 2001
From: Andrea Ponti <59694427+andreaponti5@users.noreply.github.com>
Date: Tue, 17 Sep 2024 14:02:13 +0200
Subject: [PATCH] Add Document Models (#1)

* Add base models

* Add unit tests

* Add setup.py and requirements-dev.txt

* Update README.md

* Add basic dev container configuration

---------

Co-authored-by: Alessio Vertemati <alessio.vertemati@gmail.com>
---
 .devcontainer/devcontainer.json             |  16 ++
 .github/CONTRIBUTING.md                     |  54 +++++
 .github/SECURITY.md                         |   3 +
 LICENSE                                     |   2 +-
 README.md                                   | 210 ++++++++++++++++-
 parse_document_model/__init__.py            |   1 +
 parse_document_model/attributes.py          |  27 +++
 parse_document_model/document.py            |  89 +++++++
 parse_document_model/marks.py               |  47 ++++
 requirements-dev.txt                        |   4 +
 requirements.txt                            |   1 +
 setup.py                                    |  33 +++
 {document-model-python => test}/__init__.py |   0
 test/data/extract-text-1.json               | 248 ++++++++++++++++++++
 test/data/extract-text-2.json               |  37 +++
 test/data/extract-text-empty.json           |  12 +
 test/test_validation.py                     |  68 ++++++
 17 files changed, 850 insertions(+), 2 deletions(-)
 create mode 100644 .devcontainer/devcontainer.json
 create mode 100644 .github/CONTRIBUTING.md
 create mode 100644 .github/SECURITY.md
 create mode 100644 parse_document_model/__init__.py
 create mode 100644 parse_document_model/attributes.py
 create mode 100644 parse_document_model/document.py
 create mode 100644 parse_document_model/marks.py
 create mode 100644 requirements-dev.txt
 create mode 100644 requirements.txt
 create mode 100644 setup.py
 rename {document-model-python => test}/__init__.py (100%)
 create mode 100644 test/data/extract-text-1.json
 create mode 100644 test/data/extract-text-2.json
 create mode 100644 test/data/extract-text-empty.json
 create mode 100644 test/test_validation.py

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..fcd29af
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,16 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/python
+{
+	"name": "Python 3",
+	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+	"image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
+
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+
+	// Use 'postCreateCommand' to run commands after the container is created.
+	"postCreateCommand": "pip3 install --user -r requirements.txt -r requirements-dev.txt"
+
+	// Configure tool-specific properties.
+	// "customizations": {}
+}
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 0000000..b8f4112
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,54 @@
+# Contributing
+
+Contributions are **welcome** and will be fully **credited**.
+
+Please read and understand the contribution guide before creating an issue or pull request.
+
+## Etiquette
+
+This project is open source, and as such, the maintainers give their free time to build and maintain the source code held within. They make the code freely available in the hope that it will be of use to other developers. It would be extremely unfair for them to suffer abuse or anger for their hard work.
+
+Please be considerate towards maintainers when raising issues or presenting pull requests. Let's show the
+world that developers are civilized and selfless people.
+
+It's the duty of the maintainer to ensure that all submissions to the project are of sufficient
+quality to benefit the project. Many developers have different skillsets, strengths, and weaknesses. Respect the maintainer's decision, and do not be upset or abusive if your submission is not used.
+
+## Viability
+
+When requesting or submitting new features, first consider whether it might be useful to others. Open
+source projects are used by many developers, who may have entirely different needs to your own. Think about
+whether or not your feature is likely to be used by other users of the project.
+
+## Procedure
+
+> [!NOTE]
+> Issue tracking is not currently enabled for this repository. We are organising it.
+
+Before filing an issue:
+
+- Attempt to replicate the problem, to ensure that it wasn't a coincidental incident.
+- Check to make sure your feature suggestion isn't already present within the project.
+- Check the pull requests tab to ensure that the bug doesn't have a fix in progress.
+- Check the pull requests tab to ensure that the feature isn't already in progress.
+
+Before submitting a pull request:
+
+- Check the codebase to ensure that your feature doesn't already exist.
+- Check the pull requests to ensure that another person hasn't already submitted the feature or fix.
+
+## Requirements
+
+If the project maintainer has any additional requirements, you will find them listed here.
+
+- **Add tests!** - Your patch won't be accepted if it doesn't have tests.
+
+- **Document any change in behaviour** - Make sure the `README.md` and any other relevant documentation are kept up-to-date.
+
+- **Consider our release cycle** - We try to follow [SemVer v2.0.0](https://semver.org/). Randomly breaking public APIs is not an option.
+
+- **One pull request per feature** - If you want to do more than one thing, send multiple pull requests.
+
+- **Send coherent history** - Make sure each individual commit in your pull request is meaningful. If you had to make multiple intermediate commits while developing, please [squash them](https://www.git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Changing-Multiple-Commit-Messages) before submitting.
+
+**Happy coding**!
\ No newline at end of file
diff --git a/.github/SECURITY.md b/.github/SECURITY.md
new file mode 100644
index 0000000..8e032a3
--- /dev/null
+++ b/.github/SECURITY.md
@@ -0,0 +1,3 @@
+# Security Policy
+
+If you discover any security-related issues, please email security@oneofftech.xyz instead of using the discussions or the issue tracker.
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index ff1f271..d7f078f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Andrea Ponti
+Copyright (c) OneOffTech <info@oneofftech.xyz>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index b363d42..d02a276 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,209 @@
-# :card_box: Document Model Python
+![pypi](https://img.shields.io/pypi/v/parse-document-model-python.svg)
+[![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://docs.pydantic.dev/latest/contributing/#badges)
+[![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
+
+# Parse Document Model (Python)
+
+**Parse Document Model** (Python) provides Pydantic models for representing text documents using a hierarchical model. 
+This library allows you to define documents as a hierarchy of (specialised) nodes where each node can represent a document, page, text, heading, body, and more.
+
+These models aim to preserve the underlying structure of text documents for further processing, such as creating a table of contents or transforming between formats, e.g. converting a parsed PDF to Markdown.
+
+- **Hierarchical structure**: The document is modelled as a hierarchy of nodes. Each node can represent the
+document itself, a page, or a piece of text.
+- **Rich text support**: Nodes can represent not only the content but also the formatting (e.g. bold, italic) applied to the text. 
+- **Attributes**: Each node can have attributes that provide additional information such as page number, 
+bounding box, etc.
+- **Built-in validation and types**: Built with [`Pydantic`](https://docs.pydantic.dev/latest/), ensuring type safety, validation and effortless creation of complex document structures.
+
+
+**Requirements**
+
+- Python 3.12 or above (Python 3.9, 3.10 and 3.11 are supported on a best-effort basis).
+
+
+**Next steps**
+
+- [Explore the document model](#document-model-overview)
+- [Install the library and use the models](#getting-started)
+
+
+## Document Model Overview
+
+We want to represent the document structure using a hierarchy so that the inherent structure is preserved when chapters, sections and headings are used. Consider a generic document with two pages, one heading per page and one paragraph of text. The resulting representation might be the following.
+
+```
+Document
+ ├─Page
+ │  ├─Text (category: heading)
+ │  └─Text (category: body)
+ └─Page
+    ├─Text (category: heading)
+    └─Text (category: body)
+```
+
+At a glance you can see the structure: the document is composed of two pages and there are two headings. To do so we defined a hierarchy around the concept of a Node, like a node in a graph.
+
+### Node types
+
+```mermaid
+classDiagram
+    class Node
+    Node <|-- StructuredNode
+    Node <|-- Text
+    StructuredNode <|-- Document
+    StructuredNode <|-- Page
+```
+
+
+#### 1. **Node** (Base Class)
+
+This is the abstract class from which all other nodes inherit. 
+
+Each node has:
+
+- `category`: The type of the node (e.g., `doc`, `page`, `heading`).
+- `attributes`: Optional field to attach extra data to a node. See [Attributes](#attributes).
+
+#### 2. **StructuredNode**
+
+This extends the [`Node`](#1-node-base-class). It is used to represent the hierarchy as a node whose content is a list of other nodes, such as [`Document`](#3-document) and [`Page`](#4-page).
+
+- `content`: List of `Node`.
+
+
+#### 3. **Document**
+
+This is the root node of a document.
+
+- `category`: Always set to `"doc"`.
+- `attributes`: Document-wide attributes can be set here.
+- `content`: List of [`Page`](#4-page) nodes that form the document.
+
+#### 4. **Page**
+
+Represents a page in the document:
+
+- `category`: Always set to `"page"`.
+- `attributes`: Can contain metadata like page number.
+- `content`: List of [`Text`](#5-text) nodes on the page.
+
+#### 5. **Text**
+
+This node represents a paragraph, a heading or any text within the document.
+
+- `category`: The type of text, e.g. `"heading"` or `"body"`.
+- `content`: A string representing the textual content.
+- `marks`: List of [marks](#marks) applied to the text, such as bold, italic, etc.
+- `attributes`: Can contain metadata like the bounding box representing where this portion of text is located in the page.
+
+
+
+### Marks
+
+Marks are used to add style or functionality to the text within a [`Text`](#5-text) node. 
+For example, bold text, italic text, links and custom styles such as font or colour.
+
+**Mark Types**
+
+- `Bold`: Represents bold text.
+- `Italic`: Represents italic text.
+- `TextStyle`: Allows customization of font and color.
+- `Link`: Represents a hyperlink.
+
+Marks are validated and enforced with the help of `Pydantic` model validators.
+
+### Attributes
+
+Attributes are optional fields that can store additional information for each node. Some predefined attributes are:
+
+- `DocumentAttributes`: General attributes for the document (currently reserved for the future).
+- `PageAttributes`: Specific page related attributes, such as the page number.
+- `TextAttributes`: Text related attributes, such as bounding boxes.
+- `BoundingBox`: A box that specifies the position of a text in the page.
+
+
+## Getting started
+
+### Installation
+
+Parse Document Model is distributed via PyPI. You can install it with `pip`.
+
+```bash
+pip install parse-document-model-python
+```
+
+### Quick Example
+
+Here’s how you can represent a simple document with one page and some text:
+
+```python
+from parse_document_model.document import Document, Page, Text
+
+doc = Document(
+    category="doc",
+    content=[
+        Page(
+            category="page",
+            content=[
+                Text(
+                    category="heading",
+                    content="Welcome to parse-document-model-python",
+                    marks=[{"category": "bold"}]
+                ),
+                Text(
+                    category="body",
+                    content="This is an example text using the document model."
+                )
+            ]
+        )
+    ]
+)
+```
+
+## Testing
+
+Parse Document Model is tested using [pytest](https://docs.pytest.org/en/stable/). Tests run for each commit and pull request.
+
+Install the dependencies.
+
+```bash
+pip install -r requirements.txt -r requirements-dev.txt
+```
+
+Execute the test suite.
+
+```bash
+pytest
+```
+
+
+## Contributing
+
+Thank you for considering contributing to the Parse Document Model! The contribution guide can be found in the [CONTRIBUTING.md](./.github/CONTRIBUTING.md) file.
+
+> [!NOTE]
+> Consider opening a [discussion](https://github.com/OneOffTech/parse-document-model-python/discussions) before submitting a pull request with changes to the model structures.
+
+## Security Vulnerabilities
+
+Please review [our security policy](./.github/SECURITY.md) on how to report security vulnerabilities.
+
+## Credits
+
+- [OneOffTech](https://github.com/OneOffTech)
+- [All Contributors](../../contributors)
+
+## Supporters
+
+The project is provided and supported by [OneOff-Tech (UG)](https://oneofftech.de).
+
+<p align="left"><a href="https://oneofftech.de" target="_blank"><img src="https://raw.githubusercontent.com/OneOffTech/.github/main/art/oneofftech-logo.svg" width="200"></a></p>
+
+## Acknowledgements
+
+The format and structure take inspiration from [ProseMirror](https://prosemirror.net/docs/ref/#model.Document_Schema).
+
+## License
+
+The MIT License (MIT). Please see [License File](LICENSE) for more information.
diff --git a/parse_document_model/__init__.py b/parse_document_model/__init__.py
new file mode 100644
index 0000000..095cb1f
--- /dev/null
+++ b/parse_document_model/__init__.py
@@ -0,0 +1 @@
+from .document import Document, Page
diff --git a/parse_document_model/attributes.py b/parse_document_model/attributes.py
new file mode 100644
index 0000000..79244b7
--- /dev/null
+++ b/parse_document_model/attributes.py
@@ -0,0 +1,27 @@
+from abc import ABC
+
+from pydantic import BaseModel
+
+
+class BoundingBox(BaseModel):
+    min_x: float
+    min_y: float
+    max_x: float
+    max_y: float
+    page: int
+
+
+class Attributes(BaseModel, ABC):
+    pass
+
+
+class DocumentAttributes(Attributes):
+    pass
+
+
+class PageAttributes(Attributes):
+    page: int
+
+
+class TextAttributes(Attributes):
+    bounding_box: list[BoundingBox] = []
diff --git a/parse_document_model/document.py b/parse_document_model/document.py
new file mode 100644
index 0000000..6f1e9db
--- /dev/null
+++ b/parse_document_model/document.py
@@ -0,0 +1,89 @@
+import warnings
+from abc import ABC
+from typing import List, Optional, Union
+
+from pydantic import BaseModel, Field, model_validator
+
+from parse_document_model.attributes import Attributes, PageAttributes, TextAttributes, DocumentAttributes
+from parse_document_model.marks import Mark, TextStyleMark, UrlMark
+
+
+class Node(BaseModel, ABC):
+    """Base element of a document.
+
+    A document is a hierarchy of nodes.
+    Nodes could represent: document, pages, headings, etc.
+    """
+    category: str = Field(
+        ...,
+        title="Node Type",
+        description="The type of node. Examples are: `doc`, `page`, `heading`, `body`, etc. For an exhaustive list "
+                    "refers to the documentation.",
+    )
+    attributes: Optional[Attributes] = Field(
+        default=None,
+        title="Node Attributes",
+        description="Attributes related to the node. An example is the reference page."
+    )
+
+
+class StructuredNode(Node):
+    content: List[Node] = Field(
+        ...,
+        title="Node Content",
+        description="The content of the node. If it is a leaf node this is text, otherwise it could be a list of "
+                    "nodes.",
+    )
+
+
+class Text(Node):
+    """The leaf node of a document.
+
+    That's where the actual text is.
+
+    """
+    attributes: Optional[TextAttributes] = TextAttributes()
+    content: str = Field(
+        ...,
+        title="Content",
+        description="The new field to hold the text content."
+    )
+    marks: list[Union[Mark, TextStyleMark, UrlMark]] = []
+    text: Optional[str] = Field(
+        None,
+        title="Text",
+        description="(Deprecated) This field is deprecated and will be removed in a future version. "
+                    "Use `content` instead."
+    )
+    role: Optional[str] = Field(
+        None,
+        title="Node Type",
+        description="(Deprecated) This field is deprecated and will be removed in a future version. "
+                    "Use `category` instead."
+    )
+
+    @model_validator(mode="before")
+    def handle_deprecations(self):
+        if "text" in self and "content" not in self:
+            warnings.warn("The use of `text` is deprecated and will be removed in a future version. "
+                          "Use `content` instead.", DeprecationWarning)
+            self["content"] = self["text"]
+        if "role" in self and "category" not in self:
+            warnings.warn("The use of `role` is deprecated and will be removed in a future version. "
+                          "Use `category` instead.", DeprecationWarning)
+            self["category"] = self["role"]
+        return self
+
+
+class Page(StructuredNode):
+    """The node that represents a document's page."""
+    category: str = "page"
+    attributes: Optional[PageAttributes] = None
+    content: list[Text]
+
+
+class Document(StructuredNode):
+    """The root node of a document."""
+    category: str = "doc"
+    attributes: Optional[DocumentAttributes] = None
+    content: list[Page]
diff --git a/parse_document_model/marks.py b/parse_document_model/marks.py
new file mode 100644
index 0000000..1662c56
--- /dev/null
+++ b/parse_document_model/marks.py
@@ -0,0 +1,47 @@
+from typing import Any
+from typing import Literal, Optional
+
+from pydantic import BaseModel, model_validator
+
+
+class Color(BaseModel):
+    id: str
+    r: int
+    g: int
+    b: int
+
+
+class Font(BaseModel):
+    id: str
+    name: str
+    size: int
+
+
+class Mark(BaseModel):
+    category: Literal['bold', 'italic', 'textStyle', 'link']
+
+    @model_validator(mode='before')
+    def check_details(self: Any) -> Any:
+        mark_type = self.get('category')
+
+        if mark_type == 'textStyle':
+            if 'color' not in self and 'font' not in self:
+                raise ValueError('color or font must be provided when type is textStyle')
+            if 'url' in self:
+                raise ValueError('url should not be provided when type is textStyle')
+
+        elif mark_type == 'link':
+            if 'url' not in self:
+                raise ValueError('url must be provided when type is link')
+            if 'textStyle' in self:
+                raise ValueError('textStyle should not be provided when type is link')
+        return self
+
+
+class TextStyleMark(Mark):
+    color: Optional[Color] = None
+    font: Optional[Font] = None
+
+
+class UrlMark(Mark):
+    url: str
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..7a59972
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,4 @@
+pytest~=8.3.3
+setuptools~=75.0.0
+twine~=5.1.1
+wheel~=0.44.0
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f22a600
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+pydantic~=2.9.1
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..9d0edb0
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,33 @@
+from codecs import open
+from os import path
+
+from setuptools import setup
+
+ROOT = path.abspath(path.dirname(__file__))
+
+with open(path.join(ROOT, 'README.md'), encoding='utf-8') as f:
+    long_description = f.read()
+
+setup(
+    name='document-model-python',
+    version='0.1.0',
+    description='Pydantic models for representing a text document as a hierarchical structure.',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author='OneOffTech',
+    author_email='info@oneofftech.xyz',
+    license='MIT',
+    url='https://github.com/OneOffTech/parse-document-model-python',
+    project_urls={
+        'Source': 'https://github.com/OneOffTech/parse-document-model-python',
+    },
+    classifiers=[
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python',
+        'Operating System :: OS Independent'
+    ],
+    packages=['parse_document_model'],
+    include_package_data=True,
+    install_requires=['pydantic>=2.9.0']
+)
diff --git a/document-model-python/__init__.py b/test/__init__.py
similarity index 100%
rename from document-model-python/__init__.py
rename to test/__init__.py
diff --git a/test/data/extract-text-1.json b/test/data/extract-text-1.json
new file mode 100644
index 0000000..2d83f2f
--- /dev/null
+++ b/test/data/extract-text-1.json
@@ -0,0 +1,248 @@
+{
+  "category": "doc",
+  "content": [
+    {
+      "category": "page",
+      "attributes": {
+        "page": 1
+      },
+      "content": [
+        {
+          "role": "page-header",
+          "text": "Type of document / Offer / Contract / Report",
+          "marks": [
+            {
+              "category": "textStyle",
+              "color": {
+                "r": 78,
+                "b": 189,
+                "g": 128,
+                "id": "color-1"
+              },
+              "font": {
+                "name": "fira sans",
+                "id": "font-300",
+                "size": 18
+              }
+            }
+          ],
+          "attributes": {
+            "bounding_box": [
+              {
+                "min_x": 62.1,
+                "min_y": 565.0,
+                "max_x": 427.2,
+                "max_y": 577.6,
+                "page": 1
+              }
+            ]
+          }
+        },
+        {
+          "role": "title",
+          "text": "This is the title of the document, it",
+          "marks": [
+            {
+              "category": "textStyle",
+              "color": {
+                "r": 0,
+                "b": 0,
+                "g": 0,
+                "id": "color-0"
+              },
+              "font": {
+                "name": "fira sans",
+                "id": "font-300",
+                "size": 30
+              }
+            }
+          ],
+          "attributes": {
+            "bounding_box": [
+              {
+                "min_x": 62.1,
+                "min_y": 532.1,
+                "max_x": 514.6,
+                "max_y": 554.7,
+                "page": 1
+              }
+            ]
+          }
+        },
+        {
+          "role": "heading",
+          "text": "can use multiple lines and grow a bit",
+          "marks": [
+            {
+              "category": "textStyle",
+              "color": {
+                "r": 0,
+                "b": 0,
+                "g": 0,
+                "id": "color-0"
+              },
+              "font": {
+                "name": "fira sans",
+                "id": "font-300",
+                "size": 30
+              }
+            }
+          ],
+          "attributes": {
+            "bounding_box": [
+              {
+                "min_x": 62.1,
+                "min_y": 496.1,
+                "max_x": 503.0,
+                "max_y": 518.7,
+                "page": 1
+              },
+              {
+                "min_x": 62.1,
+                "min_y": 460.1,
+                "max_x": 98.7,
+                "max_y": 482.7,
+                "page": 1
+              }
+            ]
+          }
+        },
+        {
+          "role": "heading",
+          "text": "Subtitle of the document",
+          "marks": [
+            {
+              "category": "textStyle",
+              "color": {
+                "r": 247,
+                "b": 70,
+                "g": 150,
+                "id": "color-2"
+              },
+              "font": {
+                "name": "fira sans",
+                "id": "font-300",
+                "size": 22
+              }
+            }
+          ],
+          "attributes": {
+            "bounding_box": [
+              {
+                "min_x": 62.1,
+                "min_y": 431.6,
+                "max_x": 296.5,
+                "max_y": 447.1,
+                "page": 1
+              }
+            ]
+          }
+        },
+        {
+          "role": "body",
+          "text": "OneOff-Tech UG",
+          "marks": [
+            {
+              "category": "textStyle",
+              "color": {
+                "r": 0,
+                "b": 0,
+                "g": 0,
+                "id": "color-0"
+              },
+              "font": {
+                "name": "fira sans-bold",
+                "id": "font-301",
+                "size": 11
+              }
+            },
+            {
+              "category": "bold"
+            }
+          ],
+          "attributes": {
+            "bounding_box": [
+              {
+                "min_x": 62.1,
+                "min_y": 209.0,
+                "max_x": 253.5,
+                "max_y": 217.5,
+                "page": 1
+              }
+            ]
+          }
+        }
+      ]
+    },
+    {
+      "category": "page",
+      "attributes": {
+        "page": 2
+      },
+      "content": [
+        {
+          "role": "heading",
+          "text": "1 First chapter",
+          "marks": [
+            {
+              "category": "textStyle",
+              "color": {
+                "r": 53,
+                "b": 145,
+                "g": 96,
+                "id": "color-4"
+              },
+              "font": {
+                "name": "fira sans",
+                "id": "font-300",
+                "size": 20
+              }
+            }
+          ],
+          "attributes": {
+            "bounding_box": [
+              {
+                "min_x": 56.7,
+                "min_y": 702.8,
+                "max_x": 193.9,
+                "max_y": 717.8,
+                "page": 2
+              }
+            ]
+          }
+        },
+        {
+          "role": "body",
+          "text": "This is an example text.",
+          "marks": [
+            {
+              "category": "textStyle",
+              "color": {
+                "r": 0,
+                "b": 0,
+                "g": 0,
+                "id": "color-0"
+              },
+              "font": {
+                "name": "fira sans",
+                "id": "font-300",
+                "size": 11
+              }
+            }
+          ],
+          "attributes": {
+            "bounding_box": [
+              {
+                "min_x": 56.7,
+                "min_y": 665.0,
+                "max_x": 504.2,
+                "max_y": 687.3,
+                "page": 2
+              }
+            ]
+          }
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/test/data/extract-text-2.json b/test/data/extract-text-2.json
new file mode 100644
index 0000000..a3fcd23
--- /dev/null
+++ b/test/data/extract-text-2.json
@@ -0,0 +1,37 @@
+{
+  "category": "doc",
+  "content": [
+    {
+      "category": "page",
+      "attributes": {
+        "page": 1
+      },
+      "content": [
+        {
+          "role": "body",
+          "text": "Type of document / Offer / Contract / Report This is the title of the document, it can use multiple lines and grow a bit Subtitle of the document OneOff-Tech",
+          "marks": [],
+          "attributes": {
+            "bounding_box": []
+          }
+        }
+      ]
+    },
+    {
+      "category": "page",
+      "attributes": {
+        "page": 2
+      },
+      "content": [
+        {
+          "role": "body",
+          "text": "Section Heading 1 First chapter This is an example text.",
+          "marks": [],
+          "attributes": {
+            "bounding_box": []
+          }
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/test/data/extract-text-empty.json b/test/data/extract-text-empty.json
new file mode 100644
index 0000000..cd6a8c7
--- /dev/null
+++ b/test/data/extract-text-empty.json
@@ -0,0 +1,12 @@
+{
+  "category": "doc",
+  "content": [
+    {
+      "category": "page",
+      "attributes": {
+        "page": 1
+      },
+      "content": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/test/test_validation.py b/test/test_validation.py
new file mode 100644
index 0000000..339764f
--- /dev/null
+++ b/test/test_validation.py
@@ -0,0 +1,68 @@
+import json
+
+import pytest
+
+from parse_document_model import Document, Page
+from parse_document_model.attributes import PageAttributes, TextAttributes
+from parse_document_model.marks import Mark, TextStyleMark, UrlMark
+
+
+def test_read_from_json():
+    filepaths = ["test/data/extract-text-1.json",
+                 "test/data/extract-text-2.json",
+                 "test/data/extract-text-empty.json"]
+    for filepath in filepaths:
+        doc_json = json.load(open(filepath, "r"))
+        doc = Document(**doc_json)
+
+        # Check the Document
+        assert doc.category == "doc"
+        assert isinstance(doc.content, list)
+
+        # Check the Page
+        for page in doc.content:
+            assert isinstance(page, Page)
+            assert page.category == "page"
+            assert isinstance(page.attributes, PageAttributes)
+            assert isinstance(page.content, list)
+
+            # Check Text
+            for text in page.content:
+                assert text.category in ["page-header", "title", "heading", "body", "footer"]
+                assert isinstance(text.content, str)
+                assert isinstance(text.attributes, TextAttributes)
+                assert isinstance(text.marks, list)
+
+                # Check Marks
+                for mark in text.marks:
+                    assert isinstance(mark, Mark)
+
+
+def test_style_marks():
+    text_style_mark_json = [{"category": "textStyle", "font": {"id": "1", "name": "test-font", "size": 1}},
+                            {"category": "textStyle", "color": {"id": "1", "r": 0, "g": 0, "b": 0}},
+                            {"category": "textStyle", "font": {"id": "1", "name": "test-font", "size": 1},
+                             "color": {"id": "1", "r": 0, "g": 0, "b": 0}},
+                            {"category": "textStyle"},
+                            {"category": "textStyle", "url": "test-url"}]
+    for mark_json in text_style_mark_json:
+        if "font" in mark_json or "color" in mark_json:
+            mark = TextStyleMark(**mark_json)
+            assert isinstance(mark, TextStyleMark)
+        else:
+            with pytest.raises(ValueError):
+                TextStyleMark(**mark_json)
+
+
+def test_url_marks():
+    url_mark_json = [{"category": "link", "url": "test-url"},
+                     {"category": "link"},
+                     {"category": "link", "font": {"id": "1", "name": "test-font", "size": 1}},
+                     {"category": "link", "color": {"id": "1", "r": 0, "g": 0, "b": 0}}]
+    for mark_json in url_mark_json:
+        if "url" in mark_json:
+            mark = UrlMark(**mark_json)
+            assert isinstance(mark, UrlMark)
+        else:
+            with pytest.raises(ValueError):
+                UrlMark(**mark_json)