diff --git a/.circleci/config.yml b/.circleci/config.yml index f61c996..b1f0af1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,7 +7,7 @@ version: 2.1 parameters: node_version: type: string - default: '16.15.0' + default: '18.18.2' commands: install_deps: diff --git a/bin/tt.tar.gz b/bin/tt.tar.gz index 8cd6195..62394be 100644 Binary files a/bin/tt.tar.gz and b/bin/tt.tar.gz differ diff --git a/compile-tesseract.sh b/compile-tesseract.sh index 3bcad72..c43c197 100644 --- a/compile-tesseract.sh +++ b/compile-tesseract.sh @@ -1,5 +1,5 @@ # Spin up and enter the docker container on your machine with the following command: -# docker run -it --entrypoint /bin/bash public.ecr.aws/lambda/nodejs:16-x86_64 +# docker run -it --entrypoint /bin/bash public.ecr.aws/lambda/nodejs:18-arm64 # Then run the rest of the commands inside @@ -23,7 +23,7 @@ make install cd ~ git clone https://github.com/tesseract-ocr/tesseract.git cd tesseract -git checkout 5.1.0 +git checkout 5.3.3 export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig ./autogen.sh ./configure @@ -61,6 +61,6 @@ strip ./tesseract-standalone/**/* tar -zcvf tesseract.tar.gz tesseract-standalone # download from docker to local machine -# 22e97e0774a0 is docker container id, you can look it up by running "docker ps" +# d1c431e8c85e is docker container id, you can look it up by running "docker ps" # run this outside of the docker container -docker cp 22e97e0774a0:/root/tesseract.tar.gz tt.tar.gz +docker cp d1c431e8c85e:/root/tesseract.tar.gz tt.tar.gz diff --git a/package.json b/package.json index 09d8b9e..fa7f8fd 100644 --- a/package.json +++ b/package.json @@ -57,7 +57,7 @@ "@shelf/eslint-config": "3.10.0", "@shelf/prettier-config": "1.0.0", "@types/jest": "29.5.7", - "@types/node": "16", + "@types/node": "18", "@types/tar": "6.1.7", "eslint": "8.48.0", "husky": "8.0.3", @@ -67,7 +67,7 @@ "typescript": "5.2.2" }, "engines": { - "node": ">=16" + "node": ">=18" }, "publishConfig": { "access": "public" diff --git a/readme.md b/readme.md index 4a80085..71bc82e 100644 --- a/readme.md +++ b/readme.md @@ -1,6 +1,6 @@ # aws-lambda-tesseract [![CircleCI](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master.svg?style=svg)](https://circleci.com/gh/shelfio/aws-lambda-tesseract/tree/master) ![](https://img.shields.io/badge/code_style-prettier-ff69b4.svg) [![Tesseract](https://img.shields.io/badge/tesserract-6_MB-brightgreen.svg)](bin/) -> 6 MB Tesseract 5.1 (with English training data) to fit inside AWS Lambda +> 6 MB Tesseract 5.3.3 (with English training data) to fit inside AWS Lambda Inspired by [chrome-aws-lambda](https://github.com/alixaxel/chrome-aws-lambda) & [lambda-scanner-ocr](https://github.com/philippkeller/lambda-scanner-ocr) @@ -18,9 +18,11 @@ $ yarn add @shelf/aws-lambda-tesseract `4.x` works for Node 16.x runtime and compiled with **Tesseract 5.1.0**. It works with x86_64 CPUs for now only. +`5.x` works for Node 18.x runtime and compiled with **Tesseract 5.3.3**. It works with arm64 CPUs. + ## How does it work? -This package contains an archive with [Tesseract 5.1](https://github.com/tesseract-ocr/tesseract) compiled for usage in AWS Lambda environment. +This package contains an archive with [Tesseract 5.3.3](https://github.com/tesseract-ocr/tesseract) compiled for usage in AWS Lambda environment. When a Lambda starts, it unpacks an archive with a binary to the `/tmp` folder and makes sure it's done only once per Lambda cold start. diff --git a/test.sh b/test.sh index 1e86dc6..e564189 100755 --- a/test.sh +++ b/test.sh @@ -12,5 +12,5 @@ yarn babel test.ts --out-file test.js docker run --rm \ -v "$PWD":/var/task \ -p 9000:8080 \ - public.ecr.aws/lambda/nodejs:16-x86_64 \ + public.ecr.aws/lambda/nodejs:18-arm64 \ test.handler