#include <cwctype>
#include <iostream>

#include "ltocr.h"
L_LTOCR_API L_INT EXT_FUNCTION L_OcrPage_GetRecognizedCharacters(page, pageCharacters)
Gets the last recognized character data of this L_OcrPage.
Handle to the OCR page.
Address of the L_OcrPageCharacters structure to be updated with the page's recognized characters. You should call L_OcrPage_FreePageCharacters on the 'pageCharacters' parameter to free its allocated memory when it is no longer needed.
| Value | Meaning |
|---|---|
| SUCCESS | The function was successful. |
| < 1 | An error occurred. Refer to Return Codes. |
You must call this method after the L_OcrPage has been recognized with the L_OcrPage_Recognize method. That is, if L_OcrPage_IsRecognized reports L_FALSE for this page, calling this method will return SUCCESS and the 'pageCharacters' parameter will not be updated.
You can use L_OcrPage_GetRecognizedCharacters to examine the recognized character data. This data contains information about the character codes, their confidence, guess codes, location and position in the page, as well as font information. For more information, refer to L_OcrCharacter.
If you wish to modify the recognition data and then apply it back to the page, use L_OcrPage_SetRecognizedCharacters.
Use L_OcrPage_GetZoneWords to get the recognized words of a zone.
Note: The LEADTOOLS OCR Module - LEAD Engine will not return any space characters when using the L_OcrPage_GetRecognizedCharacters method.
The L_OcrPage_SetRecognizedCharacters method will accept space characters in the LEADTOOLS LEAD engine. However, these space characters will be used when generating the final document (PDF) and might affect the final output. Therefore, it is not recommended that you insert space characters when using the LEADTOOLS LEAD engine.
Note: You should call L_OcrPage_FreePageCharacters on the 'pageCharacters' parameter to free its allocated memory when no longer needed.
Required DLLs and Libraries
L_INT L_OcrPage_GetRecognizedCharactersExample(){// Create an image with some text in itBITMAPHANDLE bitmap = { 0 };L_OcrEngine ocrEngine = NULL;L_OcrPage ocrPage = NULL;L_OcrPageCharacters ocrPageCharacters = { 0 };L_OcrDocumentManager ocrDocumentManager = NULL;L_OcrDocument ocrDocument = NULL;// Create an image to write text onL_CreateBitmap(&bitmap, sizeof(BITMAPHANDLE), TYPE_CONV, 640, 200, 24, ORDER_BGR, NULL, TOP_LEFT, NULL, 0);// Create a device context to write withL_HDC LeadDC = L_CreateLeadDC(&bitmap);L_INT StartGDIX = 0, /* Drawing coordinates */StartGDIY = 0,EndGDIX = BITMAPWIDTH(&bitmap),EndGDIY = BITMAPHEIGHT(&bitmap);if(LeadDC != NULL){HFONT hFont;RECT drawArea;// Correct viewer coordinates if necessaryif (bitmap.ViewPerspective != TOP_LEFT){L_PointToBitmap ( &bitmap, TOP_LEFT, & StartGDIX, & StartGDIY );L_PointToBitmap ( &bitmap, TOP_LEFT, & EndGDIX, & EndGDIY );}SelectObject(LeadDC, GetStockObject(WHITE_PEN));SelectObject(LeadDC, GetStockObject(NULL_BRUSH));SetRect(&drawArea, StartGDIX, StartGDIY, EndGDIX, EndGDIY);// Make the image whiteFillRect(LeadDC, &drawArea, CreateSolidBrush(RGB(255,255,255)));// Set font properties for drawinghFont = CreateFont(20, 0, 0, 0, FW_NORMAL, FALSE, FALSE, FALSE, DEFAULT_CHARSET, OUT_OUTLINE_PRECIS,CLIP_DEFAULT_PRECIS, DEFAULT_QUALITY, VARIABLE_PITCH, TEXT("Arial"));SelectObject(LeadDC, hFont);// Now write some textSetRect(&drawArea, 0, 0, 100, 20);int numChars = 11;DrawText(LeadDC, TEXT("Normal line"), numChars, &drawArea, DT_TOP | DT_LEFT);// Change font propertieshFont = CreateFont(20, 0, 0, 0, FW_BOLD, TRUE, TRUE, FALSE, DEFAULT_CHARSET, OUT_OUTLINE_PRECIS,CLIP_DEFAULT_PRECIS, CLEARTYPE_QUALITY, VARIABLE_PITCH, TEXT("Arial"));SelectObject(LeadDC,hFont);// Write a second lineSetRect(&drawArea, 0, 40, 200, 100);numChars = 26;DrawText(LeadDC, TEXT("Bold, italic and underline"), numChars, &drawArea, DT_TOP | DT_LEFT);// Change font properties againhFont = CreateFont(20, 0, 0, 0, FW_DONTCARE, FALSE, FALSE, FALSE, 
DEFAULT_CHARSET, OUT_OUTLINE_PRECIS,CLIP_DEFAULT_PRECIS, ANTIALIASED_QUALITY, VARIABLE_PITCH, TEXT("Courier New"));SelectObject(LeadDC,hFont);// Write a third lineSetRect(&drawArea, 0, 80, 160, 100);numChars = 15;DrawText(LeadDC, TEXT("Monospaced line"), numChars, &drawArea, DT_TOP | DT_LEFT);DeleteObject(hFont);}// We don't need this context anymore, so free itL_DeleteLeadDC(LeadDC);// Create an instance of the engineL_INT retCode = L_OcrEngineManager_CreateEngine(L_OcrEngineType_Advantage, &ocrEngine);if(retCode != SUCCESS)return retCode;// Start the engine using default parametersL_OcrEngine_Startup(ocrEngine, NULL, OCR_ADVANTAGE_RUNTIME_DIR);// Add this image toan OCR pageL_OcrPage_FromBitmap(ocrEngine, &ocrPage, &bitmap, L_OcrBitmapSharingMode_AutoFree, NULL, NULL);// Transfer ownership to the pagebitmap.Flags.Allocated = 0;// Recognize this pageL_OcrPage_Recognize(ocrPage, NULL, NULL);// Dump the characters to standard outputocrPageCharacters.StructSize = sizeof(L_OcrPageCharacters);L_OcrPage_GetRecognizedCharacters(ocrPage, &ocrPageCharacters);L_UINT*map = NULL;L_UINT mapSize = 0;L_OcrPageSortedZonesIndexMapOptions mapOptions = { 0 };mapOptions.StructSize = sizeof(L_OcrPageSortedZonesIndexMapOptions);mapOptions.Flags = L_OcrPageSortedZonesIndexMapFlags_TableCellsAsOne;L_OcrPage_GetSortedZonesIndexMap(ocrPage, &mapOptions, &map, &mapSize);L_UINT zoneCount = 0;L_OcrPage_GetZoneCount(ocrPage, &zoneCount);for(L_UINT zoneNum = 0; zoneNum < zoneCount; zoneNum++){// Get the recognized wordsL_OcrWords ocrWords = { 0 };ocrWords.StructSize = sizeof(L_OcrWords);L_OcrPage_GetZoneWords(&ocrPageCharacters, map[zoneNum], &ocrWords);std::wcout << L"Words in zone " << zoneNum << ":\n";for(L_UINT wordIndex = 0; wordIndex < ocrWords.WordCount; wordIndex++){L_OcrWord ocrWord = ocrWords.Words[wordIndex];// Output word infostd::wcout << L"Word: " << ocrWord.Buffer << L", at ("<< ocrWord.Bounds.left << L", " << ocrWord.Bounds.top<< L", " << ocrWords.Words[wordIndex].Bounds.right 
<< L", "<< ocrWord.Bounds.bottom << L"), characters index from "<< ocrWord.FirstCharacterIndex << L" to "<< ocrWord.LastCharacterIndex << std::endl;}// Get the data on the individual charactersL_OcrZoneCharacters* zoneChars = ocrPageCharacters.ZoneCharacters;bool nextCharacterIsNewWord = true;L_UINT charIndex = 0;while(charIndex < zoneChars->CharacterCount){// Get a specific characterL_OcrCharacter ocrCharacter = ocrPageCharacters.ZoneCharacters[zoneNum].Characters[charIndex];// Capitalize the first letter if this is a new wordif (nextCharacterIsNewWord)ocrCharacter.Code = (L_WCHAR)toupper(ocrCharacter.Code);// Output individual character informationstd::wcout << L"Code: " << ocrCharacter.Code<< L", Confidence: " << ocrCharacter.Confidence<< L", WordIsCertain: " << ocrCharacter.WordIsCertain<< L", Bounds: (" << ocrCharacter.Bounds.left << L", " << ocrCharacter.Bounds.top << L", "<< ocrCharacter.Bounds.right << L", " << ocrCharacter.Bounds.bottom<< L") , Position: " << ocrCharacter.Positions<< L", FontSize: " << ocrCharacter.FontSize<< L", FontStyle: " << ocrCharacter.FontStyles<< std::endl;// If the charcater is bold, make it underlineif ((ocrCharacter.FontStyles & L_OcrCharacterFontStyles_Bold) == L_OcrCharacterFontStyles_Bold){ocrCharacter.FontStyles |= L_OcrCharacterFontStyles_Italic;ocrCharacter.FontStyles |= L_OcrCharacterFontStyles_Underline;}// Check if next character is the start of a new wordif ((ocrCharacter.Positions & L_OcrCharacterPositions_EndOfWord) == L_OcrCharacterPositions_EndOfWord ||(ocrCharacter.Positions & L_OcrCharacterPositions_EndOfLine) == L_OcrCharacterPositions_EndOfLine)nextCharacterIsNewWord = true;elsenextCharacterIsNewWord = false;// Make change with our copy of dataocrPageCharacters.ZoneCharacters[zoneNum].Characters[charIndex] = ocrCharacter;// Go to the next charactercharIndex++;}// For output spacingstd::wcout << std::endl;// Free this now that we are done with itL_OcrPage_FreeWords(&ocrWords);}// Update the engine with our 
character changesL_OcrPage_SetRecognizedCharacters(ocrPage, &ocrPageCharacters);// Release the dataL_OcrPage_FreePageCharacters(&ocrPageCharacters);// Create an OCR documentL_OcrEngine_GetDocumentManager(ocrEngine, &ocrDocumentManager);// Show the recognition results// Set the PDF options to save as PDF/A text onlyDOCWRTPDFOPTIONS pdfOptions;pdfOptions.Options.uStructSize = sizeof(DOCWRTPDFOPTIONS);L_OcrDocumentManager_GetFormatOptions(ocrDocumentManager, DOCUMENTFORMAT_PDF, &pdfOptions.Options);// Set the specific PDF options we wantpdfOptions.FontEmbed = DOCWRTFONTEMBED_AUTO;pdfOptions.bImageOverText = false;pdfOptions.PdfProfile = DOCWRTPDFPROFILE_PDFA;// Give the engine our updated PDF optionsL_OcrDocumentManager_SetFormatOptions(ocrDocumentManager, DOCUMENTFORMAT_PDF, &pdfOptions.Options);// Create an OCR documentL_OcrDocumentManager_CreateDocument(ocrDocumentManager, &ocrDocument, L_OcrCreateDocumentOptions_AutoDeleteFile, NULL);// In Document File Mode, add OcrPage to OcrDocument after recognitionL_OcrDocument_AddPage(ocrDocument, ocrPage);// Free this now that we are done with itL_OcrPage_Destroy(ocrPage);// Save the outputL_OcrDocument_Save(ocrDocument, MAKE_IMAGE_PATH(L_TEXT("MyImageWithTest.pdf")), DOCUMENTFORMAT_PDF, NULL, NULL);// CLEANUPif(bitmap.Flags.Allocated)L_FreeBitmap(&bitmap);// Free allocated sorted zones map bufferif(map != NULL)L_OcrMemory_Free(map);// Destroy the documentL_OcrDocument_Destroy(ocrDocument);// Shutdown the engineL_OcrEngine_Destroy(ocrEngine);// Open and check the result file, it should contain the following text// "Normal Line"// "Bold And Italic Line"// "Monospaced Line"// With the second line bold and underlined nowreturn SUCCESS;}