diff --git a/client.go b/client.go index fc5b0ea..fe8dafc 100644 --- a/client.go +++ b/client.go @@ -73,9 +73,9 @@ type Client struct { // NewClient construct new Client. It's due to caller to Close this client. func NewClient() *Client { client := &Client{ - api: C.Create(), - Variables: map[SettableVariable]string{}, - Trim: true, + api: C.Create(), + Variables: map[SettableVariable]string{}, + Trim: true, shouldInit: true, } return client @@ -164,6 +164,7 @@ func (client *Client) SetLanguage(langs ...string) error { return nil } +// DisableOutput ... func (client *Client) DisableOutput() error { err := client.SetVariable(DEBUG_FILE, os.DevNull) @@ -332,9 +333,10 @@ func (client *Client) HOCRText() (out string, err error) { // BoundingBox contains the position, confidence and UTF8 text of the recognized word type BoundingBox struct { - Box image.Rectangle - Word string - Confidence float64 + Box image.Rectangle + Word string + Confidence float64 + BlockNum, ParNum, LineNum, WordNum int } // GetBoundingBoxes returns bounding boxes for each matched word @@ -364,16 +366,46 @@ func (client *Client) GetBoundingBoxes(level PageIteratorLevel) (out []BoundingB } // GetAvailableLanguages returns a list of available languages in the default tesspath -func GetAvailableLanguages() (languages []string, err error) { +func GetAvailableLanguages() ([]string, error) { path := C.GoString(C.GetDataPath()) - if languages, err = filepath.Glob(filepath.Join(path, "*.traineddata")); err != nil { - return + languages, err := filepath.Glob(filepath.Join(path, "*.traineddata")) + if err != nil { + return languages, err } for i := 0; i < len(languages); i++ { languages[i] = filepath.Base(languages[i]) idx := strings.Index(languages[i], ".") languages[i] = languages[i][:idx] } + return languages, nil +} + +// GetBoundingBoxesVerbose returns bounding boxes at word level with block_num, par_num, line_num and word_num +// according to the c++ api that returns a formatted TSV output. Reference: `TessBaseAPI::GetTSVText`. +func (client *Client) GetBoundingBoxesVerbose() (out []BoundingBox, err error) { + if client.api == nil { + return out, fmt.Errorf("TessBaseAPI is not constructed, please use `gosseract.NewClient`") + } + if err = client.init(); err != nil { + return + } + boxArray := C.GetBoundingBoxesVerbose(client.api) + length := int(boxArray.length) + defer C.free(unsafe.Pointer(boxArray.boxes)) + defer C.free(unsafe.Pointer(boxArray)) + for i := 0; i < length; i++ { + // cast to bounding_box: boxes + i*sizeof(box) + box := (*C.struct_bounding_box)(unsafe.Pointer(uintptr(unsafe.Pointer(boxArray.boxes)) + uintptr(i)*unsafe.Sizeof(C.struct_bounding_box{}))) + out = append(out, BoundingBox{ + Box: image.Rect(int(box.x1), int(box.y1), int(box.x2), int(box.y2)), + Word: C.GoString(box.word), + Confidence: float64(box.confidence), + BlockNum: int(box.block_num), + ParNum: int(box.par_num), + LineNum: int(box.line_num), + WordNum: int(box.word_num), + }) + } return } diff --git a/tessbridge.cpp b/tessbridge.cpp index 9b4204c..4f10819 100644 --- a/tessbridge.cpp +++ b/tessbridge.cpp @@ -1,156 +1,221 @@ #if __FreeBSD__ >= 10 -#include "/usr/local/include/tesseract/baseapi.h" #include "/usr/local/include/leptonica/allheaders.h" +#include "/usr/local/include/tesseract/baseapi.h" #else -#include #include +#include #endif -#include "tessbridge.h" #include #include +#include "tessbridge.h" TessBaseAPI Create() { - tesseract::TessBaseAPI * api = new tesseract::TessBaseAPI(); - return (void*)api; + tesseract::TessBaseAPI* api = new tesseract::TessBaseAPI(); + return (void*)api; } void Free(TessBaseAPI a) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - api->End(); - delete api; + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + api->End(); + delete api; } void Clear(TessBaseAPI a) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - api->Clear(); + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + api->Clear(); } void ClearPersistentCache(TessBaseAPI a) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - api->ClearPersistentCache(); + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + api->ClearPersistentCache(); } int Init(TessBaseAPI a, char* tessdataprefix, char* languages) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - return api->Init(tessdataprefix, languages); + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + return api->Init(tessdataprefix, languages); } int Init(TessBaseAPI a, char* tessdataprefix, char* languages, char* configfilepath, char* errbuf) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - - // {{{ Redirect STDERR to given buffer - fflush(stderr); - int original_stderr; - original_stderr = dup(STDERR_FILENO); - (void) freopen("/dev/null", "a", stderr); - setbuf(stderr, errbuf); - // }}} - - int ret; - if (configfilepath != NULL) { - char *configs[]={configfilepath}; - int configs_size = 1; - ret = api->Init(tessdataprefix, languages, tesseract::OEM_DEFAULT, configs, configs_size, NULL, NULL, false); - } else { - ret = api->Init(tessdataprefix, languages); - } - - // {{{ Restore default stderr - (void) freopen("/dev/null", "a", stderr); - dup2(original_stderr, STDERR_FILENO); - setbuf(stderr, NULL); - // }}} - - return ret; + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + + // {{{ Redirect STDERR to given buffer + fflush(stderr); + int original_stderr; + original_stderr = dup(STDERR_FILENO); + (void)freopen("/dev/null", "a", stderr); + setbuf(stderr, errbuf); + // }}} + + int ret; + if (configfilepath != NULL) { + char* configs[] = {configfilepath}; + int configs_size = 1; + ret = api->Init(tessdataprefix, languages, tesseract::OEM_DEFAULT, configs, configs_size, NULL, NULL, false); + } else { + ret = api->Init(tessdataprefix, languages); + } + + // {{{ Restore default stderr + (void)freopen("/dev/null", "a", stderr); + dup2(original_stderr, STDERR_FILENO); + setbuf(stderr, NULL); + // }}} + + return ret; } bool SetVariable(TessBaseAPI a, char* name, char* value) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - return api->SetVariable(name, value); + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + return api->SetVariable(name, value); } void SetPixImage(TessBaseAPI a, PixImage pix) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - Pix *image = (Pix*) pix; - api->SetImage(image); - if (api->GetSourceYResolution() < 70) { - api->SetSourceResolution(70); - } + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + Pix* image = (Pix*)pix; + api->SetImage(image); + if (api->GetSourceYResolution() < 70) { + api->SetSourceResolution(70); + } } void SetPageSegMode(TessBaseAPI a, int m) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - tesseract::PageSegMode mode = (tesseract::PageSegMode)m; - api->SetPageSegMode(mode); + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + tesseract::PageSegMode mode = (tesseract::PageSegMode)m; + api->SetPageSegMode(mode); } int GetPageSegMode(TessBaseAPI a) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - return api->GetPageSegMode(); + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + return api->GetPageSegMode(); } char* UTF8Text(TessBaseAPI a) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - return api->GetUTF8Text(); + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + return api->GetUTF8Text(); } char* HOCRText(TessBaseAPI a) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - return api->GetHOCRText(0); + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + return api->GetHOCRText(0); +} + +bounding_boxes* GetBoundingBoxesVerbose(TessBaseAPI a) { + using namespace tesseract; + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + struct bounding_boxes* box_array; + box_array = (bounding_boxes*)malloc(sizeof(bounding_boxes)); + // linearly resize boxes array + int realloc_threshold = 900; + int realloc_raise = 1000; + int capacity = 1000; + box_array->boxes = (bounding_box*)malloc(capacity * sizeof(bounding_box)); + box_array->length = 0; + api->Recognize(NULL); + int block_num = 0; + int par_num = 0; + int line_num = 0; + int word_num = 0; + + ResultIterator* res_it = api->GetIterator(); + while (!res_it->Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + // Add rows for any new block/paragraph/textline. + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + block_num++; + par_num = 0; + line_num = 0; + word_num = 0; + } + if (res_it->IsAtBeginningOf(RIL_PARA)) { + par_num++; + line_num = 0; + word_num = 0; + } + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + line_num++; + word_num = 0; + } + word_num++; + + if (box_array->length >= realloc_threshold) { + capacity += realloc_raise; + box_array->boxes = (bounding_box*)realloc(box_array->boxes, capacity * sizeof(bounding_box)); + realloc_threshold += realloc_raise; + } + + box_array->boxes[box_array->length].word = res_it->GetUTF8Text(RIL_WORD); + box_array->boxes[box_array->length].confidence = res_it->Confidence(RIL_WORD); + res_it->BoundingBox(RIL_WORD, &box_array->boxes[box_array->length].x1, &box_array->boxes[box_array->length].y1, + &box_array->boxes[box_array->length].x2, &box_array->boxes[box_array->length].y2); + + // block, para, line, word numbers + box_array->boxes[box_array->length].block_num = block_num; + box_array->boxes[box_array->length].par_num = par_num; + box_array->boxes[box_array->length].line_num = line_num; + box_array->boxes[box_array->length].word_num = word_num; + + box_array->length++; + res_it->Next(RIL_WORD); + } + + return box_array; } bounding_boxes* GetBoundingBoxes(TessBaseAPI a, int pageIteratorLevel) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - struct bounding_boxes* box_array; - box_array = (bounding_boxes*)malloc(sizeof(bounding_boxes)); - // linearly resize boxes array - int realloc_threshold = 900; - int realloc_raise = 1000; - int capacity = 1000; - box_array->boxes = (bounding_box*)malloc(capacity * sizeof(bounding_box)); - box_array->length = 0; - api->Recognize(NULL); - tesseract::ResultIterator* ri = api->GetIterator(); - tesseract::PageIteratorLevel level = (tesseract::PageIteratorLevel)pageIteratorLevel; - - if (ri != 0) { - do { - if ( box_array->length >= realloc_threshold ) { - capacity += realloc_raise; - box_array->boxes = (bounding_box*)realloc(box_array->boxes, capacity * sizeof(bounding_box)); - realloc_threshold += realloc_raise; - } - box_array->boxes[box_array->length].word = ri->GetUTF8Text(level); - box_array->boxes[box_array->length].confidence = ri->Confidence(level); - ri->BoundingBox(level, &box_array->boxes[box_array->length].x1, &box_array->boxes[box_array->length].y1, &box_array->boxes[box_array->length].x2, &box_array->boxes[box_array->length].y2); - box_array->length++; - } while (ri->Next(level)); - } - - return box_array; + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + struct bounding_boxes* box_array; + box_array = (bounding_boxes*)malloc(sizeof(bounding_boxes)); + // linearly resize boxes array + int realloc_threshold = 900; + int realloc_raise = 1000; + int capacity = 1000; + box_array->boxes = (bounding_box*)malloc(capacity * sizeof(bounding_box)); + box_array->length = 0; + api->Recognize(NULL); + tesseract::ResultIterator* ri = api->GetIterator(); + tesseract::PageIteratorLevel level = (tesseract::PageIteratorLevel)pageIteratorLevel; + + if (ri != 0) { + do { + if (box_array->length >= realloc_threshold) { + capacity += realloc_raise; + box_array->boxes = (bounding_box*)realloc(box_array->boxes, capacity * sizeof(bounding_box)); + realloc_threshold += realloc_raise; + } + box_array->boxes[box_array->length].word = ri->GetUTF8Text(level); + box_array->boxes[box_array->length].confidence = ri->Confidence(level); + ri->BoundingBox(level, &box_array->boxes[box_array->length].x1, &box_array->boxes[box_array->length].y1, + &box_array->boxes[box_array->length].x2, &box_array->boxes[box_array->length].y2); + box_array->length++; + } while (ri->Next(level)); + } + + return box_array; } const char* Version(TessBaseAPI a) { - tesseract::TessBaseAPI * api = (tesseract::TessBaseAPI*)a; - const char* v = api->Version(); - return v; + tesseract::TessBaseAPI* api = (tesseract::TessBaseAPI*)a; + const char* v = api->Version(); + return v; } PixImage CreatePixImageByFilePath(char* imagepath) { - Pix *image = pixRead(imagepath); - return (void*)image; + Pix* image = pixRead(imagepath); + return (void*)image; } PixImage CreatePixImageFromBytes(unsigned char* data, int size) { - Pix *image = pixReadMem(data, (size_t)size); - return (void*)image; + Pix* image = pixReadMem(data, (size_t)size); + return (void*)image; } - -void DestroyPixImage(PixImage pix){ - Pix *img = (Pix*) pix; - pixDestroy(&img); +void DestroyPixImage(PixImage pix) { + Pix* img = (Pix*)pix; + pixDestroy(&img); } const char* GetDataPath() { diff --git a/tessbridge.h b/tessbridge.h index e93d42f..161c08a 100644 --- a/tessbridge.h +++ b/tessbridge.h @@ -6,9 +6,10 @@ typedef void* TessBaseAPI; typedef void* PixImage; struct bounding_box { - int x1,y1,x2,y2; + int x1, y1, x2, y2; char* word; float confidence; + int block_num, par_num, line_num, word_num; }; struct bounding_boxes { @@ -23,6 +24,7 @@ void Clear(TessBaseAPI); void ClearPersistentCache(TessBaseAPI); int Init(TessBaseAPI, char*, char*, char*, char*); struct bounding_boxes* GetBoundingBoxes(TessBaseAPI, int); +struct bounding_boxes* GetBoundingBoxesVerbose(TessBaseAPI); bool SetVariable(TessBaseAPI, char*, char*); void SetPixImage(TessBaseAPI a, PixImage pix); void SetPageSegMode(TessBaseAPI, int); @@ -38,4 +40,4 @@ void DestroyPixImage(PixImage pix); #ifdef __cplusplus } -#endif/* extern "C" */ +#endif /* extern "C" */