OCR Tutorial - Scanning to Searchable PDF

Take the following steps to create and run a program that shows how to scan a document and convert it to a searchable PDF file.

  1. Start Visual Studio

  2. Choose File->New->Project from the menu.

  3. In the New Project dialog box, choose either "Visual C# Projects" or "VB Projects" in the Projects Type List, and choose "Windows Application" or "Windows Forms Application" depending on your Visual Studio version from the Templates List.

  4. Type the project name as "OcrTutorial3" in the Project Name field, and then choose OK. If desired, type a new location for your project or select a directory using the Browse button, and then choose OK.

  5. In the "Solution Explorer" window, right-click on the "References" folder, and select "Add Reference..." from the context menu. In the "Add Reference" dialog box, select the ".NET" tab and browse to LEADTOOLS For .NET "<LEADTOOLS_INSTALLDIR>\Bin\DotNet4\Win32" folder and select the following DLLs:

    • Leadtools.dll
    • Leadtools.Codecs.dll
    • Leadtools.Twain.dll
    • Leadtools.ImageProcessing.Core.dll
    • Leadtools.Forms.dll
    • Leadtools.Forms.DocumentWriters.dll
    • Leadtools.Forms.Ocr.dll
    • Leadtools.Forms.Ocr.Advantage.dll
    • Leadtools.Codecs.Bmp.dll
    • Leadtools.Codecs.Cmp.dll
    • Leadtools.Codecs.Tif.dll
    • Leadtools.Codecs.Fax.dll

    Note: The Leadtools.Codecs.*.dll references added are for the BMP, JPG, CMP, TIF and FAX image formats. Add any additional file format codec DLL if required in your application.

  6. Drag and drop three buttons onto Form1. Leave all the button names as the default "button1, button2 ...", then change the Text property of each button to the following:

    Button Text
    button1 Change output directory
    button2 Select the Scanner
    button3 Scan and OCR
  7. Switch to Form1 code view (Right-click Form1 in the solution explorer then select View Code) and add the following lines at the beginning of the file after any using or Imports section if there are any:

    C#
    VB
    using Leadtools; 
          using Leadtools.Twain; 
          using Leadtools.ImageProcessing; 
          using Leadtools.ImageProcessing.Core; 
          using Leadtools.Forms; 
          using Leadtools.Forms.DocumentWriters; 
          using Leadtools.Forms.Ocr; 
          using Leadtools.ImageProcessing; 
    Imports Leadtools 
          Imports Leadtools.Twain 
          Imports Leadtools.ImageProcessing 
          Imports Leadtools.ImageProcessing.Core 
          Imports Leadtools.Forms 
          Imports Leadtools.Forms.DocumentWriters 
          Imports Leadtools.Forms.Ocr 
          Imports Leadtools.ImageProcessing 

  8. Add the following private variables to the Form1 class:

    C#
    VB
    // The OCR engine instance; created and started in OnLoad, disposed in OnFormClosed 
          private IOcrEngine _ocrEngine; 
          // OCR document instance; a new one is created for each "Scan and OCR" click and 
          // the scanned pages are added to it by the AcquirePage handler 
          private IOcrDocument _ocrDocument; 
          // The Twain session 
          private TwainSession _twainSession; 
          // The output directory for saving PDF files 
          private string _outputDirectory = @"C:\MyImages"; 
          // The image processing commands we are going to use to clean the scanned image 
          private List<RasterCommand> _imageProcessingCommands; 
          // Running scan counter; appended to "Scanned" to build each output PDF file name 
          private int _scanCount; 
    ' The OCR engine instance; created and started in OnLoad, disposed in OnFormClosed 
          Private _ocrEngine As IOcrEngine 
          ' OCR document instance; a new one is created for each "Scan and OCR" click and 
          ' the scanned pages are added to it by the AcquirePage handler 
          Private _ocrDocument As IOcrDocument 
          ' The Twain session 
          Private _twainSession As TwainSession 
          ' The output directory for saving PDF files 
          Private _outputDirectory As String = "C:\MyImages" 
          ' The image processing commands we are going to use to clean the scanned image 
          Private _imageProcessingCommands As List(Of RasterCommand) 
          ' Running scan counter; appended to "Scanned" to build each output PDF file name 
          Private _scanCount As Integer 

  9. Override the OnLoad method of Form1 and add the following code:

    C#
    VB
    // Runs when the form loads: creates and starts the OCR engine, starts the TWAIN
          // session, hooks its AcquirePage event and builds the list of image clean-up
          // commands. The base implementation is invoked last.
          protected override void OnLoad(EventArgs e)
          {
             // OCR: create the Advantage engine, then start it.
             // NOTE(review): the last Startup argument looks like the engine runtime folder
             // of a default LEADTOOLS 19 install - adjust it to match your installation.
             _ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, false);
             _ocrEngine.Startup(null, null, null, @"C:\LEADTOOLS 19\Bin\Common\OcrAdvantageRuntime");

             // TWAIN: start a scanning session owned by this window
             _twainSession = new TwainSession();
             _twainSession.Startup(this.Handle, "My Company", "My Product", "My Version", "My Application", TwainStartupFlags.None);

             // Each scanned page is delivered through the TwainSession.AcquirePage event
             _twainSession.AcquirePage += _twainSession_AcquirePage;

             // Clean-up commands run on every scanned page before OCR.
             // Deskew and Despeckle are used here; add more as needed.
             _imageProcessingCommands = new List<RasterCommand>
             {
                new DeskewCommand(),
                new DespeckleCommand()
             };

             base.OnLoad(e);
          }
    ' Runs when the form loads: creates and starts the OCR engine, starts the TWAIN
          ' session, hooks its AcquirePage event and builds the list of image clean-up
          ' commands. The base implementation is invoked last.
          Protected Overrides Sub OnLoad(e As EventArgs)
             ' OCR: create the Advantage engine, then start it.
             ' NOTE(review): the last Startup argument looks like the engine runtime folder
             ' of a default LEADTOOLS 19 install - adjust it to match your installation.
             _ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, False)
             _ocrEngine.Startup(Nothing, Nothing, Nothing, "C:\LEADTOOLS 19\Bin\Common\OcrAdvantageRuntime")

             ' TWAIN: start a scanning session owned by this window
             _twainSession = New TwainSession()
             _twainSession.Startup(Me.Handle, "My Company", "My Product", "My Version", "My Application", TwainStartupFlags.None)

             ' Each scanned page is delivered through the TwainSession.AcquirePage event
             AddHandler _twainSession.AcquirePage, AddressOf _twainSession_AcquirePage

             ' Clean-up commands run on every scanned page before OCR.
             ' Deskew and Despeckle are used here; add more as needed.
             _imageProcessingCommands = New List(Of RasterCommand) From {
                New DeskewCommand(),
                New DespeckleCommand()
             }

             MyBase.OnLoad(e)
          End Sub

  10. Override the OnFormClosed method of Form1 and add the following code:

    C#
    VB
    // Runs after the form has closed: releases the OCR engine and shuts down the TWAIN
          // session that OnLoad created, then invokes the base implementation.
          protected override void OnFormClosed(FormClosedEventArgs e)
          {
             try
             {
                // Either field can still be null if OnLoad failed part-way through
                // (for example, the OCR runtime folder was not found), so guard both.

                // Shutdown and dispose the OCR engine
                if (_ocrEngine != null)
                {
                   _ocrEngine.Dispose();
                   _ocrEngine = null;
                }

                // And the twain session
                if (_twainSession != null)
                {
                   _twainSession.Shutdown();
                   _twainSession = null;
                }
             }
             finally
             {
                // Always let the base class raise FormClosed, even if cleanup threw
                base.OnFormClosed(e);
             }
          }
    ' Runs after the form has closed: releases the OCR engine and shuts down the TWAIN
          ' session that OnLoad created, then invokes the base implementation.
          Protected Overrides Sub OnFormClosed(e As FormClosedEventArgs)
             Try
                ' Either field can still be Nothing if OnLoad failed part-way through
                ' (for example, the OCR runtime folder was not found), so guard both.

                ' Shutdown and dispose the OCR engine
                If _ocrEngine IsNot Nothing Then
                   _ocrEngine.Dispose()
                   _ocrEngine = Nothing
                End If

                ' And the twain session
                If _twainSession IsNot Nothing Then
                   _twainSession.Shutdown()
                   _twainSession = Nothing
                End If
             Finally
                ' Always let the base class raise FormClosed, even if cleanup threw
                MyBase.OnFormClosed(e)
             End Try
          End Sub

  11. Add the following code for the button1 (Change output directory) Click handler:

    C#
    VB
    // Click handler for button1 ("Change output directory"): lets the user pick the
          // folder where the PDF files are saved. _outputDirectory is left unchanged
          // unless the dialog is confirmed with OK.
          private void button1_Click(object sender, EventArgs e)
          {
             using (FolderBrowserDialog folderPicker = new FolderBrowserDialog())
             {
                // Start from the current output folder and allow creating a new one
                folderPicker.ShowNewFolderButton = true;
                folderPicker.SelectedPath = _outputDirectory;

                DialogResult answer = folderPicker.ShowDialog(this);
                if (answer != DialogResult.OK)
                {
                   return;
                }

                // Store the chosen folder as a full path
                _outputDirectory = System.IO.Path.GetFullPath(folderPicker.SelectedPath);
             }
          }
    ' Click handler for button1 ("Change output directory"): lets the user pick the
          ' folder where the PDF files are saved. _outputDirectory is left unchanged
          ' unless the dialog is confirmed with OK.
          Private Sub button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles button1.Click
             Using folderPicker As New FolderBrowserDialog()
                ' Start from the current output folder and allow creating a new one
                folderPicker.ShowNewFolderButton = True
                folderPicker.SelectedPath = _outputDirectory

                If folderPicker.ShowDialog(Me) <> DialogResult.OK Then
                   Return
                End If

                ' Store the chosen folder as a full path
                _outputDirectory = System.IO.Path.GetFullPath(folderPicker.SelectedPath)
             End Using
          End Sub

  12. Add the following code for the button2 (Select the Scanner) Click handler:

    C#
    VB
    // Click handler for button2 ("Select the Scanner"). 
          private void button2_Click(object sender, EventArgs e) 
          { 
             // Select the scanner to use 
             // NOTE(review): passing null presumably shows the TWAIN source-selection 
             // dialog instead of picking a source by name - confirm against the 
             // TwainSession.SelectSource documentation 
             _twainSession.SelectSource(null); 
          } 
    ' Click handler for button2 ("Select the Scanner"). 
          Private Sub button2_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles button2.Click 
             ' Select the scanner to use 
             ' NOTE(review): passing Nothing presumably shows the TWAIN source-selection 
             ' dialog instead of picking a source by name - confirm against the 
             ' TwainSession.SelectSource documentation 
             _twainSession.SelectSource(Nothing) 
          End Sub 

  13. Add the following code for the button3 (Scan and Ocr) Click handler:

    C#
    VB
    // Click handler for button3 ("Scan and OCR"): scans one or more pages into a new
          // OCR document, saves the document as "Scanned<N>.pdf" in the output directory
          // and opens the resulting file. Nothing is saved or opened when no page was
          // scanned (for example, the user cancelled the scanner dialog).
          private void button3_Click(object sender, EventArgs e)
          {
             // Make sure the output directory exists (CreateDirectory does nothing if it already does)
             System.IO.Directory.CreateDirectory(_outputDirectory);

             // Build the output PDF file name
             string pdfFileName = System.IO.Path.Combine(_outputDirectory, "Scanned" + _scanCount + ".pdf");
             bool saved = false;

             // Create a new file-based OCR document to add the scanned pages to
             _ocrDocument = _ocrEngine.DocumentManager.CreateDocument(null, OcrCreateDocumentOptions.AutoDeleteFile);
             try
             {
                // Scan the new page(s); the AcquirePage handler adds each one to _ocrDocument
                _twainSession.Acquire(TwainUserInterfaceFlags.Show);

                // Only save when at least one page was actually scanned
                if (_ocrDocument.Pages.Count > 0)
                {
                   // Save as PDF
                   _ocrDocument.Save(pdfFileName, DocumentFormat.Pdf, null);

                   // Consume the sequence number only once a file was really written
                   _scanCount++;
                   saved = true;
                }
             }
             finally
             {
                // Delete the document, even if scanning or saving failed
                _ocrDocument.Dispose();
             }

             // Show the result PDF file
             if (saved)
             {
                System.Diagnostics.Process.Start(pdfFileName);
             }
          }
    ' Click handler for button3 ("Scan and OCR"): scans one or more pages into a new
          ' OCR document, saves the document as "Scanned<N>.pdf" in the output directory
          ' and opens the resulting file. Nothing is saved or opened when no page was
          ' scanned (for example, the user cancelled the scanner dialog).
          Private Sub button3_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles button3.Click
             ' Make sure the output directory exists (CreateDirectory does nothing if it already does)
             System.IO.Directory.CreateDirectory(_outputDirectory)

             ' Build the output PDF file name.
             ' Use & with an explicit ToString(): "Scanned" + _scanCount makes VB attempt a
             ' numeric addition, which fails at run time (or at compile time with Option Strict On).
             Dim pdfFileName As String = System.IO.Path.Combine(_outputDirectory, "Scanned" & _scanCount.ToString() & ".pdf")
             Dim saved As Boolean = False

             ' Create a new file-based OCR document to add the scanned pages to
             _ocrDocument = _ocrEngine.DocumentManager.CreateDocument(Nothing, OcrCreateDocumentOptions.AutoDeleteFile)
             Try
                ' Scan the new page(s); the AcquirePage handler adds each one to _ocrDocument
                _twainSession.Acquire(TwainUserInterfaceFlags.Show)

                ' Only save when at least one page was actually scanned
                If _ocrDocument.Pages.Count > 0 Then
                   ' Save as PDF
                   _ocrDocument.Save(pdfFileName, DocumentFormat.Pdf, Nothing)

                   ' Consume the sequence number only once a file was really written
                   _scanCount = _scanCount + 1
                   saved = True
                End If
             Finally
                ' Delete the document, even if scanning or saving failed
                _ocrDocument.Dispose()
             End Try

             ' Show the result PDF file
             If saved Then
                System.Diagnostics.Process.Start(pdfFileName)
             End If
          End Sub

  14. Finally, add the following code for the TWAIN AcquirePage event handler:

    C#
    VB
    // Handler for TwainSession.AcquirePage, called with each scanned page: cleans the
          // image with the configured commands, recognizes it and adds the resulting page
          // to the current OCR document.
          private void _twainSession_AcquirePage(object sender, TwainAcquirePageEventArgs e)
          {
             // The page that was just scanned
             RasterImage scannedImage = e.Image;

             // Apply every clean-up command to the image, in list order
             _imageProcessingCommands.ForEach(cleanup => cleanup.Run(scannedImage));

             // Wrap the image in an OCR page.
             // NOTE(review): AutoDispose presumably hands ownership of the image to the
             // page - confirm against the OcrImageSharingMode documentation.
             IOcrPage page = _ocrEngine.CreatePage(scannedImage, OcrImageSharingMode.AutoDispose);
             try
             {
                // Recognize the page, then append it to the document
                page.Recognize(null);
                _ocrDocument.Pages.Add(page);
             }
             finally
             {
                // The page object is released once it has been added (or on failure)
                if (page != null)
                {
                   page.Dispose();
                }
             }
          }
    ' Handler for TwainSession.AcquirePage, called with each scanned page: cleans the
          ' image with the configured commands, recognizes it and adds the resulting page
          ' to the current OCR document.
          Private Sub _twainSession_AcquirePage(sender As Object, e As TwainAcquirePageEventArgs)
             ' The page that was just scanned
             Dim scannedImage As RasterImage = e.Image

             ' Apply every clean-up command to the image, in list order
             For Each cleanup As RasterCommand In _imageProcessingCommands
                cleanup.Run(scannedImage)
             Next

             ' Wrap the image in an OCR page.
             ' NOTE(review): AutoDispose presumably hands ownership of the image to the
             ' page - confirm against the OcrImageSharingMode documentation.
             Dim page As IOcrPage = _ocrEngine.CreatePage(scannedImage, OcrImageSharingMode.AutoDispose)
             Try
                ' Recognize the page, then append it to the document
                page.Recognize(Nothing)
                _ocrDocument.Pages.Add(page)
             Finally
                ' The page object is released once it has been added (or on failure)
                If page IsNot Nothing Then
                   page.Dispose()
                End If
             End Try
          End Sub

  15. Build, and Run the program to test it.

Products | Support | Contact Us | Copyright Notices
© 1991-2017 LEAD Technologies, Inc. All Rights Reserved.
LEADTOOLS Imaging, Medical, and Document
Click or drag to resize