L_OcrPage_GetRecognizedCharacters (original) (raw)

Summary

Gets the last recognized character data of this L_OcrPage.

Syntax

#include "ltocr.h"

L_LTOCR_API L_INT EXT_FUNCTION L_OcrPage_GetRecognizedCharacters(page, pageCharacters)

Parameters

L_OcrPage page

Handle to the OCR page.

L_OcrPageCharacters* pageCharacters

Address of an L_OcrPageCharacters structure to be updated with the recognized characters of the page. Call L_OcrPage_FreePageCharacters on the 'pageCharacters' parameter to free its allocated memory when it is no longer needed.

Returns

Value Meaning
SUCCESS The function was successful.
< 1 An error occurred. Refer to Return Codes.

Comments

You must call this function after the L_OcrPage has been recognized with the L_OcrPage_Recognize function. If L_OcrPage_IsRecognized reports L_FALSE for this page, calling this function will still return SUCCESS, but the 'pageCharacters' parameter will not be updated.

You can use L_OcrPage_GetRecognizedCharacters to examine the recognized character data. This data contains information about the character codes, their confidence, guess codes, location and position in the page, as well as font information. For more information, refer to L_OcrCharacter.

If you wish to modify the recognition data and then apply it back to the page, use L_OcrPage_SetRecognizedCharacters.

Use L_OcrPage_GetZoneWords to get the recognized words of a zone.

Note: The LEADTOOLS OCR Module - LEAD Engine will not return any space characters when using the L_OcrPage_GetRecognizedCharacters method.

The L_OcrPage_SetRecognizedCharacters method will accept space characters in the LEADTOOLS LEAD engine. However, these space characters will be used when generating the final document (PDF) and might affect the final output. Therefore, it is not recommended that you insert space characters when using the LEADTOOLS LEAD engine.

Note: You should call L_OcrPage_FreePageCharacters on the 'pageCharacters' parameter to free its allocated memory when no longer needed.

Required DLLs and Libraries

See Also

Functions

Topics

Example

// Example: draws three lines of text into a bitmap, recognizes them with the
// LEAD OCR engine, dumps the recognized words and characters to std::wcout,
// edits the character data (capitalizes the first letter of every word and
// makes bold characters italic + underlined), applies the edited data back to
// the page and saves the result as a PDF/A document.
//
// Returns SUCCESS when every step succeeded; otherwise the return code of the
// first LEADTOOLS call that failed. All resources acquired up to the point of
// failure are released before returning.
L_INT L_OcrPage_GetRecognizedCharactersExample()
{
   BITMAPHANDLE bitmap = { 0 };
   L_OcrEngine ocrEngine = NULL;
   L_OcrPage ocrPage = NULL;
   L_OcrPageCharacters ocrPageCharacters = { 0 };
   L_OcrDocumentManager ocrDocumentManager = NULL;
   L_OcrDocument ocrDocument = NULL;
   L_UINT* map = NULL;
   L_UINT mapSize = 0;
   L_UINT zoneCount = 0;
   bool pageCharactersLoaded = false;

   // Create an image to write text on
   L_INT retCode = L_CreateBitmap(&bitmap, sizeof(BITMAPHANDLE), TYPE_CONV, 640, 200, 24, ORDER_BGR, NULL, TOP_LEFT, NULL, 0);
   if(retCode != SUCCESS)
      return retCode;

   // Create a device context to write with
   L_HDC LeadDC = L_CreateLeadDC(&bitmap);
   if(LeadDC != NULL)
   {
      /* Drawing coordinates */
      L_INT StartGDIX = 0;
      L_INT StartGDIY = 0;
      L_INT EndGDIX = BITMAPWIDTH(&bitmap);
      L_INT EndGDIY = BITMAPHEIGHT(&bitmap);
      RECT drawArea;

      // Correct viewer coordinates if necessary
      if(bitmap.ViewPerspective != TOP_LEFT)
      {
         L_PointToBitmap(&bitmap, TOP_LEFT, &StartGDIX, &StartGDIY);
         L_PointToBitmap(&bitmap, TOP_LEFT, &EndGDIX, &EndGDIY);
      }

      SelectObject(LeadDC, GetStockObject(WHITE_PEN));
      SelectObject(LeadDC, GetStockObject(NULL_BRUSH));
      SetRect(&drawArea, StartGDIX, StartGDIY, EndGDIX, EndGDIY);

      // Make the image white. The brush is ours, so delete it once the fill is done.
      HBRUSH hWhiteBrush = CreateSolidBrush(RGB(255,255,255));
      FillRect(LeadDC, &drawArea, hWhiteBrush);
      DeleteObject(hWhiteBrush);

      // Set font properties for drawing. Remember the font that was selected
      // before so it can be restored prior to deleting our own fonts.
      HFONT hNormalFont = CreateFont(20, 0, 0, 0, FW_NORMAL, FALSE, FALSE, FALSE, DEFAULT_CHARSET, OUT_OUTLINE_PRECIS, CLIP_DEFAULT_PRECIS, DEFAULT_QUALITY, VARIABLE_PITCH, TEXT("Arial"));
      HFONT hOriginalFont = (HFONT)SelectObject(LeadDC, hNormalFont);

      // Now write some text
      SetRect(&drawArea, 0, 0, 100, 20);
      int numChars = 11;
      DrawText(LeadDC, TEXT("Normal line"), numChars, &drawArea, DT_TOP | DT_LEFT);

      // Change font properties
      HFONT hBoldFont = CreateFont(20, 0, 0, 0, FW_BOLD, TRUE, TRUE, FALSE, DEFAULT_CHARSET, OUT_OUTLINE_PRECIS, CLIP_DEFAULT_PRECIS, CLEARTYPE_QUALITY, VARIABLE_PITCH, TEXT("Arial"));
      SelectObject(LeadDC, hBoldFont);

      // Write a second line
      SetRect(&drawArea, 0, 40, 200, 100);
      numChars = 26;
      DrawText(LeadDC, TEXT("Bold, italic and underline"), numChars, &drawArea, DT_TOP | DT_LEFT);

      // Change font properties again
      HFONT hMonoFont = CreateFont(20, 0, 0, 0, FW_DONTCARE, FALSE, FALSE, FALSE, DEFAULT_CHARSET, OUT_OUTLINE_PRECIS, CLIP_DEFAULT_PRECIS, ANTIALIASED_QUALITY, VARIABLE_PITCH, TEXT("Courier New"));
      SelectObject(LeadDC, hMonoFont);

      // Write a third line
      SetRect(&drawArea, 0, 80, 160, 100);
      numChars = 15;
      DrawText(LeadDC, TEXT("Monospaced line"), numChars, &drawArea, DT_TOP | DT_LEFT);

      // Deselect our last font before deleting, then release every font we created
      SelectObject(LeadDC, hOriginalFont);
      DeleteObject(hNormalFont);
      DeleteObject(hBoldFont);
      DeleteObject(hMonoFont);

      // We don't need this context anymore, so free it
      L_DeleteLeadDC(LeadDC);
   }

   // Create an instance of the engine
   retCode = L_OcrEngineManager_CreateEngine(L_OcrEngineType_LEAD, &ocrEngine);

   // Start the engine using default parameters
   if(retCode == SUCCESS)
      retCode = L_OcrEngine_Startup(ocrEngine, NULL, OCR_LEAD_RUNTIME_DIR);

   // Add this image to an OCR page
   if(retCode == SUCCESS)
   {
      retCode = L_OcrPage_FromBitmap(ocrEngine, &ocrPage, &bitmap, L_OcrBitmapSharingMode_AutoFree, NULL, NULL);

      // Transfer ownership to the page, but only if the page really took the bitmap
      if(retCode == SUCCESS)
         bitmap.Flags.Allocated = 0;
   }

   // Recognize this page
   if(retCode == SUCCESS)
      retCode = L_OcrPage_Recognize(ocrPage, NULL, NULL);

   // Get the recognized character data
   if(retCode == SUCCESS)
   {
      ocrPageCharacters.StructSize = sizeof(L_OcrPageCharacters);
      retCode = L_OcrPage_GetRecognizedCharacters(ocrPage, &ocrPageCharacters);
      pageCharactersLoaded = (retCode == SUCCESS);
   }

   // Get the sorted zones index map
   if(retCode == SUCCESS)
   {
      L_OcrPageSortedZonesIndexMapOptions mapOptions = { 0 };
      mapOptions.StructSize = sizeof(L_OcrPageSortedZonesIndexMapOptions);
      mapOptions.Flags = L_OcrPageSortedZonesIndexMapFlags_TableCellsAsOne;
      retCode = L_OcrPage_GetSortedZonesIndexMap(ocrPage, &mapOptions, &map, &mapSize);
   }

   if(retCode == SUCCESS)
      retCode = L_OcrPage_GetZoneCount(ocrPage, &zoneCount);

   // Dump the words and characters to standard output and edit the characters
   if(retCode == SUCCESS && ocrPageCharacters.ZoneCharacters != NULL)
   {
      for(L_UINT zoneNum = 0; zoneNum < zoneCount; zoneNum++)
      {
         std::wcout << L"Words in zone " << zoneNum << L":\n";

         // Get the recognized words; never index the map past the size the engine reported
         if(map != NULL && zoneNum < mapSize)
         {
            L_OcrWords ocrWords = { 0 };
            ocrWords.StructSize = sizeof(L_OcrWords);
            retCode = L_OcrPage_GetZoneWords(&ocrPageCharacters, map[zoneNum], &ocrWords);
            if(retCode != SUCCESS)
               break;

            for(L_UINT wordIndex = 0; wordIndex < ocrWords.WordCount; wordIndex++)
            {
               const L_OcrWord& ocrWord = ocrWords.Words[wordIndex];

               // Output word info
               std::wcout << L"Word: " << ocrWord.Buffer
                          << L", at (" << ocrWord.Bounds.left << L", " << ocrWord.Bounds.top << L", " << ocrWord.Bounds.right << L", " << ocrWord.Bounds.bottom
                          << L"), characters index from " << ocrWord.FirstCharacterIndex << L" to " << ocrWord.LastCharacterIndex << std::endl;
            }

            // Free this now that we are done with it
            L_OcrPage_FreeWords(&ocrWords);
         }

         // Get the data on the individual characters of THIS zone. The count and
         // the character array must come from the same zone entry.
         L_OcrZoneCharacters* zoneChars = &ocrPageCharacters.ZoneCharacters[zoneNum];
         bool nextCharacterIsNewWord = true;

         for(L_UINT charIndex = 0; charIndex < zoneChars->CharacterCount; charIndex++)
         {
            // Work directly on our copy of the page data so changes are kept
            L_OcrCharacter& ocrCharacter = zoneChars->Characters[charIndex];

            // Capitalize the first letter if this is a new word. toupper is only
            // defined for values representable as unsigned char, so leave wider
            // character codes untouched.
            if(nextCharacterIsNewWord && ocrCharacter.Code <= 0xFF)
               ocrCharacter.Code = (L_WCHAR)toupper(ocrCharacter.Code);

            // Output individual character information
            std::wcout << L"Code: " << ocrCharacter.Code
                       << L", Confidence: " << ocrCharacter.Confidence
                       << L", WordIsCertain: " << ocrCharacter.WordIsCertain
                       << L", Bounds: (" << ocrCharacter.Bounds.left << L", " << ocrCharacter.Bounds.top << L", " << ocrCharacter.Bounds.right << L", " << ocrCharacter.Bounds.bottom
                       << L") , Position: " << ocrCharacter.Positions
                       << L", FontSize: " << ocrCharacter.FontSize
                       << L", FontStyle: " << ocrCharacter.FontStyles << std::endl;

            // If the character is bold, also make it italic and underlined
            if((ocrCharacter.FontStyles & L_OcrCharacterFontStyles_Bold) == L_OcrCharacterFontStyles_Bold)
            {
               ocrCharacter.FontStyles |= L_OcrCharacterFontStyles_Italic;
               ocrCharacter.FontStyles |= L_OcrCharacterFontStyles_Underline;
            }

            // Check if next character is the start of a new word
            nextCharacterIsNewWord =
               (ocrCharacter.Positions & L_OcrCharacterPositions_EndOfWord) == L_OcrCharacterPositions_EndOfWord ||
               (ocrCharacter.Positions & L_OcrCharacterPositions_EndOfLine) == L_OcrCharacterPositions_EndOfLine;
         }

         // For output spacing
         std::wcout << std::endl;
      }
   }

   // Update the engine with our character changes
   if(retCode == SUCCESS)
      retCode = L_OcrPage_SetRecognizedCharacters(ocrPage, &ocrPageCharacters);

   // Release the character data (needed on both the success and the failure path)
   if(pageCharactersLoaded)
      L_OcrPage_FreePageCharacters(&ocrPageCharacters);

   // Get the document manager used to create the output document
   if(retCode == SUCCESS)
      retCode = L_OcrEngine_GetDocumentManager(ocrEngine, &ocrDocumentManager);

   // Set the PDF options to save as PDF/A text only
   if(retCode == SUCCESS)
   {
      DOCWRTPDFOPTIONS pdfOptions = {};
      pdfOptions.Options.uStructSize = sizeof(DOCWRTPDFOPTIONS);
      retCode = L_OcrDocumentManager_GetFormatOptions(ocrDocumentManager, DOCUMENTFORMAT_PDF, &pdfOptions.Options);

      if(retCode == SUCCESS)
      {
         // Set the specific PDF options we want
         pdfOptions.FontEmbed = DOCWRTFONTEMBED_AUTO;
         pdfOptions.bImageOverText = false;
         pdfOptions.PdfProfile = DOCWRTPDFPROFILE_PDFA;

         // Give the engine our updated PDF options
         retCode = L_OcrDocumentManager_SetFormatOptions(ocrDocumentManager, DOCUMENTFORMAT_PDF, &pdfOptions.Options);
      }
   }

   // Create an OCR document
   if(retCode == SUCCESS)
      retCode = L_OcrDocumentManager_CreateDocument(ocrDocumentManager, &ocrDocument, L_OcrCreateDocumentOptions_AutoDeleteFile, NULL);

   // In Document File Mode, add OcrPage to OcrDocument after recognition
   if(retCode == SUCCESS)
      retCode = L_OcrDocument_AddPage(ocrDocument, ocrPage);

   // Free the page now that we are done with it (also covers every failure path above)
   if(ocrPage != NULL)
   {
      L_OcrPage_Destroy(ocrPage);
      ocrPage = NULL;
   }

   // Save the output
   if(retCode == SUCCESS)
      retCode = L_OcrDocument_Save(ocrDocument, MAKE_IMAGE_PATH(L_TEXT("MyImageWithTest.pdf")), DOCUMENTFORMAT_PDF, NULL, NULL);

   // CLEANUP
   // The bitmap is still ours if it was never handed over to an OCR page
   if(bitmap.Flags.Allocated)
      L_FreeBitmap(&bitmap);

   // Free allocated sorted zones map buffer
   if(map != NULL)
      L_OcrMemory_Free(map);

   // Destroy the document
   if(ocrDocument != NULL)
      L_OcrDocument_Destroy(ocrDocument);

   // Destroy the engine
   if(ocrEngine != NULL)
      L_OcrEngine_Destroy(ocrEngine);

   // Open and check the result file, it should contain the following text
   // "Normal Line"
   // "Bold, Italic And Underline"
   // "Monospaced Line"
   // with the first letter of every word capitalized and the bold (second)
   // line marked italic and underlined
   return retCode;
}