Skip to content

Commit

Permalink
Merge pull request #723 from IBM/docling-1.20.0
Browse files Browse the repository at this point in the history
Update Docling to 1.20.0
  • Loading branch information
touma-I authored Oct 18, 2024
2 parents c7c3bca + 6908816 commit c56f016
Show file tree
Hide file tree
Showing 19 changed files with 79 additions and 33 deletions.
2 changes: 1 addition & 1 deletion transforms/language/doc_chunk/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
 data-prep-toolkit==0.2.2.dev1
-docling-core==1.3.0
+docling-core==1.7.2
 llama-index-core>=0.11.0,<0.12.0
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
 "job name": "doc_chunk",
 "job type": "pure python",
 "job id": "job_id",
-"start_time": "2024-09-18 16:05:04",
-"end_time": "2024-09-18 16:05:04",
+"start_time": "2024-10-18 14:05:09",
+"end_time": "2024-10-18 14:05:11",
 "status": "success"
 },
 "code": {
Expand All @@ -24,6 +24,8 @@
 "output_jsonpath_column_name": "doc_jsonpath",
 "output_pageno_column_name": "page_number",
 "output_bbox_column_name": "bbox",
+"chunk_size_tokens": 128,
+"chunk_overlap_tokens": 30,
 "checkpointing": false,
 "max_files": -1,
 "random_samples": -1,
Expand All @@ -32,12 +34,19 @@
 ],
 "num_processors": 0
 },
+"execution_stats": {
+"cpus": 27.9,
+"gpus": 0,
+"memory": 25.75,
+"object_store": 0,
+"execution time, min": 0.021
+},
 "job_output_stats": {
 "source_files": 1,
 "source_size": 50276,
 "result_files": 1,
-"result_size": 31246,
-"processing_time": 0.071,
+"result_size": 31223,
+"processing_time": 1.266,
 "nfiles": 1,
 "nrows": 88,
 "source_doc_count": 1,
Expand Down
Binary file not shown.
17 changes: 13 additions & 4 deletions transforms/language/doc_chunk/ray/test-data/expected/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
 "job name": "doc_chunk",
 "job type": "pure python",
 "job id": "job_id",
-"start_time": "2024-09-18 16:05:04",
-"end_time": "2024-09-18 16:05:04",
+"start_time": "2024-10-18 14:05:09",
+"end_time": "2024-10-18 14:05:11",
 "status": "success"
 },
 "code": {
Expand All @@ -24,6 +24,8 @@
 "output_jsonpath_column_name": "doc_jsonpath",
 "output_pageno_column_name": "page_number",
 "output_bbox_column_name": "bbox",
+"chunk_size_tokens": 128,
+"chunk_overlap_tokens": 30,
 "checkpointing": false,
 "max_files": -1,
 "random_samples": -1,
Expand All @@ -32,12 +34,19 @@
 ],
 "num_processors": 0
 },
+"execution_stats": {
+"cpus": 27.9,
+"gpus": 0,
+"memory": 25.75,
+"object_store": 0,
+"execution time, min": 0.021
+},
 "job_output_stats": {
 "source_files": 1,
 "source_size": 50276,
 "result_files": 1,
-"result_size": 31246,
-"processing_time": 0.071,
+"result_size": 31223,
+"processing_time": 1.266,
 "nfiles": 1,
 "nrows": 88,
 "source_doc_count": 1,
Expand Down
Binary file modified transforms/language/doc_chunk/ray/test-data/expected/test1.parquet
Binary file not shown.
8 changes: 4 additions & 4 deletions transforms/language/pdf2parquet/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
 data-prep-toolkit==0.2.2.dev1
-docling-core==1.3.0
-docling-ibm-models==1.1.7
-deepsearch-glm==0.21.0
-docling==1.11.0
+docling-core==1.7.2
+docling-ibm-models==2.0.0
+deepsearch-glm==0.22.0
+docling==1.20.0
 filetype >=1.2.0, <2.0.0
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
 "job name": "pdf2parquet",
 "job type": "pure python",
 "job id": "job_id",
-"start_time": "2024-08-22 16:01:48",
-"end_time": "2024-08-22 16:02:07",
+"start_time": "2024-10-18 06:02:44",
+"end_time": "2024-10-18 06:03:04",
 "status": "success"
 },
 "code": {
Expand All @@ -29,12 +29,19 @@
 ],
 "num_processors": 0
 },
+"execution_stats": {
+"cpus": 29.2,
+"gpus": 0,
+"memory": 29.7,
+"object_store": 0,
+"execution time, min": 0.329
+},
 "job_output_stats": {
 "source_files": 2,
 "source_size": 605137,
 "result_files": 2,
-"result_size": 31976,
-"processing_time": 12.429,
+"result_size": 32086,
+"processing_time": 5.981,
 "nrows": 3,
 "nsuccess": 3,
 "nfail": 0,
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
 "job name": "pdf2parquet",
 "job type": "pure python",
 "job id": "job_id",
-"start_time": "2024-09-10 21:28:38",
-"end_time": "2024-09-10 21:28:47",
+"start_time": "2024-10-18 06:09:35",
+"end_time": "2024-10-18 06:09:44",
 "status": "success"
 },
 "code": {
Expand All @@ -29,12 +29,19 @@
 ],
 "num_processors": 0
 },
+"execution_stats": {
+"cpus": 25.3,
+"gpus": 0,
+"memory": 29.52,
+"object_store": 0,
+"execution time, min": 0.138
+},
 "job_output_stats": {
 "source_files": 2,
 "source_size": 605137,
 "result_files": 2,
-"result_size": 33316,
-"processing_time": 6.048,
+"result_size": 33227,
+"processing_time": 5.64,
 "nrows": 3,
 "nsuccess": 3,
 "nfail": 0,
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
 "job name": "pdf2parquet",
 "job type": "pure python",
 "job id": "job_id",
-"start_time": "2024-08-22 16:04:27",
-"end_time": "2024-08-22 16:04:42",
+"start_time": "2024-10-18 06:09:08",
+"end_time": "2024-10-18 06:09:12",
 "status": "success"
 },
 "code": {
Expand All @@ -29,12 +29,19 @@
 ],
 "num_processors": 0
 },
+"execution_stats": {
+"cpus": 25.5,
+"gpus": 0,
+"memory": 27.42,
+"object_store": 0,
+"execution time, min": 0.066
+},
 "job_output_stats": {
 "source_files": 2,
 "source_size": 605137,
 "result_files": 2,
-"result_size": 28828,
-"processing_time": 10.41,
+"result_size": 27574,
+"processing_time": 3.448,
 "nrows": 3,
 "nsuccess": 3,
 "nfail": 0,
Expand Down
Binary file not shown.
8 changes: 4 additions & 4 deletions transforms/language/pdf2parquet/ray/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
 dpk-pdf2parquet-transform-python==0.2.2.dev1
 data-prep-toolkit-ray==0.2.2.dev1
-docling-core==1.3.0
-docling-ibm-models==1.1.7
-deepsearch-glm==0.21.0
-docling==1.11.0
+docling-core==1.7.2
+docling-ibm-models==2.0.0
+deepsearch-glm==0.22.0
+docling==1.20.0
 filetype >=1.2.0, <2.0.0
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
 "job name": "pdf2parquet",
 "job type": "pure python",
 "job id": "job_id",
-"start_time": "2024-08-22 16:01:48",
-"end_time": "2024-08-22 16:02:07",
+"start_time": "2024-10-18 06:02:44",
+"end_time": "2024-10-18 06:03:04",
 "status": "success"
 },
 "code": {
Expand All @@ -29,12 +29,19 @@
 ],
 "num_processors": 0
 },
+"execution_stats": {
+"cpus": 29.2,
+"gpus": 0,
+"memory": 29.7,
+"object_store": 0,
+"execution time, min": 0.329
+},
 "job_output_stats": {
 "source_files": 2,
 "source_size": 605137,
 "result_files": 2,
-"result_size": 31976,
-"processing_time": 12.429,
+"result_size": 32086,
+"processing_time": 5.981,
 "nrows": 3,
 "nsuccess": 3,
 "nfail": 0,
Expand Down
Binary file not shown.

0 comments on commit c56f016

Please sign in to comment.