Scan to Searchable PDF

Take the following steps to create and run a program that shows how to scan a document and convert it to a searchable PDF file.

  1. Start Visual Studio

  2. Choose File->New->Project from the menu

  3. In the New Project dialog box, choose either "Visual C# Projects" or "VB Projects" in the Projects Type List, and choose "Windows Application" in Visual Studio 2005 or "Windows Forms Application" in Visual Studio 2008 from the Templates List

  4. Type the project name as "OcrTutorial3" in the Project Name field, and then choose OK. If desired, type a new location for your project or select a directory using the Browse button, and then choose OK.

  5. In the "Solution Explorer" window, right-click on the "References" folder, and select "Add Reference..." from the context menu. In the "Add Reference" dialog box, select the ".NET" tab and browse to LEADTOOLS For .NET "<LEADTOOLS_INSTALLDIR>\Bin\DotNet\Win32" folder and select the following DLLs:

    • Leadtools.dll
    • Leadtools.Codecs.dll
    • Leadtools.Forms.dll
    • Leadtools.Forms.DocumentWriters.dll
    • Leadtools.Forms.Ocr.dll
    • Leadtools.Forms.Ocr.Professional.dll
    • Leadtools.Twain.dll
    • Leadtools.ImageProcessing.Core.dll
    • Leadtools.Codecs.Bmp.dll
    • Leadtools.Codecs.Cmp.dll
    • Leadtools.Codecs.Tif.dll
    • Leadtools.Codecs.Fax.dll

    Note: The Leadtools.Codecs.*.dll references added are for the BMP, JPG, CMP, TIF and FAX image file formats. Add any additional file format codec DLL if required in your application.

  6. Drag and drop three buttons onto Form1. Leave all the button names as the default "button1, button2 ...", then change the Text property of each button to the following:

    Button  | Text
    button1 | Change output directory
    button2 | Select the Scanner
    button3 | Scan and OCR

  7. Switch to Form1 code view (Right-click Form1 in the solution explorer then select View Code) and add the following lines at the beginning of the file after any Imports or using section if there are any:

    C#
    VB
    Imports Leadtools 
    Imports Leadtools.Codecs 
    Imports Leadtools.Twain 
    Imports Leadtools.Forms 
    Imports Leadtools.Forms.DocumentWriters 
    Imports Leadtools.Forms.Ocr 
    Imports Leadtools.ImageProcessing.Core 
    using Leadtools; 
    using Leadtools.Codecs; 
    using Leadtools.Twain; 
    using Leadtools.Forms; 
    using Leadtools.Forms.DocumentWriters; 
    using Leadtools.Forms.Ocr; 
    using Leadtools.ImageProcessing.Core; 

  8. Add the following private variables to the Form1 class:

    C#
    VB
    ' The OCR engine instance; created and started in the constructor and disposed in OnFormClosed
    Private _ocrEngine As IOcrEngine 
    ' The OCR document; scanned pages are added to it and it is saved as the PDF file
    Private _ocrDocument As IOcrDocument 
    ' The Twain session; used to select the scanner and to acquire the pages
    Private _twainSession As TwainSession 
    ' The output directory for saving PDF files; the button1 Click handler lets the user change it
    Private _outputDirectory As String = "C:\MyImages" 
    ' The image processing commands we are going to use to clean the scanned image.
    ' They are created and configured once in the constructor and run on every acquired page
    Private deskewCmd As DeskewCommand 
    Private despeckleCmd As DespeckleCommand 
    Private dotRemoveCmd As DotRemoveCommand 
    Private holePunchRemoveCmd As HolePunchRemoveCommand 
    Private lineRemoveCmd As LineRemoveCommand 
    // The OCR engine instance; created and started in the constructor and disposed in OnFormClosed 
    private IOcrEngine _ocrEngine; 
    // The OCR document; scanned pages are added to it and it is saved as the PDF file 
    private IOcrDocument _ocrDocument; 
    // The Twain session; used to select the scanner and to acquire the pages 
    private TwainSession _twainSession; 
    // The output directory for saving PDF files; the button1 Click handler lets the user change it 
    private string _outputDirectory = @"C:\MyImages"; 
    // The image processing commands we are going to use to clean the scanned image. 
    // They are created and configured once in the constructor and run on every acquired page 
    private DeskewCommand deskewCmd; 
    private DespeckleCommand despeckleCmd; 
    private DotRemoveCommand dotRemoveCmd; 
    private HolePunchRemoveCommand holePunchRemoveCmd; 
    private LineRemoveCommand lineRemoveCmd; 

  9. Add the following code to the Form1 constructor (in VB, you can copy/paste the whole Sub New code from here):

    C#
    VB
    Sub New()
       ' This call is required by the Windows Form Designer.
       InitializeComponent()
       ' Add any initialization after the InitializeComponent() call.

       ' License file and the developer keys used with it
       Const licenseFile As String = "d:\temp\TestLic.lic"
       Const ocrProKey As String = "xyz123abc"
       Const ocrPdfKey As String = "abc123xyz"
       Const documentKey As String = "123xyzabc"

       ' Unlock the OCR support, then the PDF save support, then Document support
       RasterSupport.SetLicense(licenseFile, ocrProKey)
       RasterSupport.SetLicense(licenseFile, ocrPdfKey)
       RasterSupport.SetLicense(licenseFile, documentKey)

       ' Create the OCR engine, start it up and create the OCR document that will hold the scanned pages
       _ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, False)
       _ocrEngine.Startup(Nothing, Nothing, Nothing, "C:\LEADTOOLS 19\Bin\Common\OcrAdvantageRuntime")
       _ocrDocument = _ocrEngine.DocumentManager.CreateDocument()

       ' Initialize the Twain scanning session
       _twainSession = New TwainSession()
       _twainSession.Startup(Me, "My Company", "My Product", "My Version", "My Application", TwainStartupFlags.None)
       ' Subscribe to the TwainSession.AcquirePage event to get each scanned image
       AddHandler _twainSession.AcquirePage, AddressOf _twainSession_AcquirePage

       ' Create the image processing commands used to clean the scanned images.
       ' Deskew and Despeckle are used with their default settings
       deskewCmd = New DeskewCommand()
       despeckleCmd = New DespeckleCommand()

       ' DotRemove settings
       dotRemoveCmd = New DotRemoveCommand()
       With dotRemoveCmd
          .Flags = DotRemoveCommandFlags.UseDiagonals Or DotRemoveCommandFlags.UseSize
          .MaximumDotHeight = 8
          .MaximumDotWidth = 8
          .MinimumDotHeight = 2
          .MinimumDotWidth = 2
       End With

       ' HolePunchRemove settings
       holePunchRemoveCmd = New HolePunchRemoveCommand()
       With holePunchRemoveCmd
          .Flags = HolePunchRemoveCommandFlags.UseDpi Or HolePunchRemoveCommandFlags.UseCount Or HolePunchRemoveCommandFlags.UseLocation
          .Location = HolePunchRemoveCommandLocation.Left
       End With

       ' LineRemove settings (the line direction is set right before each run)
       lineRemoveCmd = New LineRemoveCommand()
       With lineRemoveCmd
          .MaximumLineWidth = 9
          .MinimumLineLength = 400
          .Wall = 15
          .MaximumWallPercent = 10
          .Variance = 3
          .GapLength = 3
       End With
    End Sub
    public Form1()
    {
       InitializeComponent();

       // License file and the developer keys used with it
       const string licenseFile = "d:\\temp\\TestLic.lic";
       const string ocrProKey = "xyz123abc";
       const string ocrPdfKey = "abc123xyz";
       const string documentKey = "123xyzabc";

       // Unlock the OCR support, then the PDF save support, then Document support
       RasterSupport.SetLicense(licenseFile, ocrProKey);
       RasterSupport.SetLicense(licenseFile, ocrPdfKey);
       RasterSupport.SetLicense(licenseFile, documentKey);

       // Create the OCR engine, start it up and create the OCR document that will hold the scanned pages
       _ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, false);
       _ocrEngine.Startup(null, null, null, @"C:\LEADTOOLS 19\Bin\Common\OcrAdvantageRuntime");
       _ocrDocument = _ocrEngine.DocumentManager.CreateDocument();

       // Initialize the Twain scanning session
       _twainSession = new TwainSession();
       _twainSession.Startup(this, "My Company", "My Product", "My Version", "My Application", TwainStartupFlags.None);
       // Subscribe to the TwainSession.AcquirePage event to get each scanned image
       _twainSession.AcquirePage += _twainSession_AcquirePage;

       // Create the image processing commands used to clean the scanned images.
       // Deskew and Despeckle are used with their default settings
       deskewCmd = new DeskewCommand();
       despeckleCmd = new DespeckleCommand();

       // DotRemove settings
       dotRemoveCmd = new DotRemoveCommand();
       dotRemoveCmd.Flags = DotRemoveCommandFlags.UseDiagonals | DotRemoveCommandFlags.UseSize;
       dotRemoveCmd.MaximumDotHeight = 8;
       dotRemoveCmd.MaximumDotWidth = 8;
       dotRemoveCmd.MinimumDotHeight = 2;
       dotRemoveCmd.MinimumDotWidth = 2;

       // HolePunchRemove settings
       holePunchRemoveCmd = new HolePunchRemoveCommand();
       holePunchRemoveCmd.Flags = HolePunchRemoveCommandFlags.UseDpi | HolePunchRemoveCommandFlags.UseCount | HolePunchRemoveCommandFlags.UseLocation;
       holePunchRemoveCmd.Location = HolePunchRemoveCommandLocation.Left;

       // LineRemove settings (the line direction is set right before each run)
       lineRemoveCmd = new LineRemoveCommand();
       lineRemoveCmd.MaximumLineWidth = 9;
       lineRemoveCmd.MinimumLineLength = 400;
       lineRemoveCmd.Wall = 15;
       lineRemoveCmd.MaximumWallPercent = 10;
       lineRemoveCmd.Variance = 3;
       lineRemoveCmd.GapLength = 3;
    }

  10. Override the Form1 OnFormClosed method to add the code necessary to shut down the OCR engine and the Twain session when the application terminates:

    C#
    VB
    ' Releases the OCR and Twain resources created in the constructor when the form closes.
    ' The Twain session shutdown and the base call are in a Finally block so they still run
    ' if disposing the OCR objects throws.
    Protected Overrides Sub OnFormClosed(ByVal e As FormClosedEventArgs)
       Try
          ' Destroy the OCR document
          _ocrDocument.Dispose()
          ' Shutdown and dispose the OCR engine
          _ocrEngine.Dispose()
       Finally
          ' Close the Twain session
          _twainSession.Shutdown()
          MyBase.OnFormClosed(e)
       End Try
    End Sub
    // Releases the OCR and Twain resources created in the constructor when the form closes.
    // The Twain session shutdown and the base call are in a finally block so they still run
    // if disposing the OCR objects throws.
    protected override void OnFormClosed(FormClosedEventArgs e)
    {
       try
       {
          // Destroy the OCR document
          _ocrDocument.Dispose();
          // Shutdown and dispose the OCR engine
          _ocrEngine.Dispose();
       }
       finally
       {
          // Close the Twain session
          _twainSession.Shutdown();
          base.OnFormClosed(e);
       }
    }

  11. Add the following code for the button1 (Change output directory) control’s Click handler:

    C#
    VB
    ' Click handler for button1 (Change output directory).
    ' Lets the user pick the folder where the PDF file will be saved and stores its full path in _outputDirectory.
    Private Sub button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles button1.Click
       ' FolderBrowserDialog is disposable, so release it as soon as we are done with it
       Using dlg As New FolderBrowserDialog()
          ' Start at the current output directory and allow creating a new folder
          dlg.SelectedPath = _outputDirectory
          dlg.ShowNewFolderButton = True
          ' Only change the output directory if the user confirmed the selection
          If (dlg.ShowDialog(Me) = DialogResult.OK) Then
             _outputDirectory = System.IO.Path.GetFullPath(dlg.SelectedPath)
          End If
       End Using
    End Sub
    // Click handler for button1 (Change output directory).
    // Lets the user pick the folder where the PDF file will be saved and stores its full path in _outputDirectory.
    private void button1_Click(object sender, EventArgs e)
    {
       // FolderBrowserDialog is disposable, so release it as soon as we are done with it
       using(FolderBrowserDialog dlg = new FolderBrowserDialog())
       {
          // Start at the current output directory and allow creating a new folder
          dlg.SelectedPath = _outputDirectory;
          dlg.ShowNewFolderButton = true;
          // Only change the output directory if the user confirmed the selection
          if(dlg.ShowDialog(this) == DialogResult.OK)
          {
             _outputDirectory = System.IO.Path.GetFullPath(dlg.SelectedPath);
          }
       }
    }

  12. Add the following code for the button2 (Select the Scanner) control’s Click handler:

    C#
    VB
    ' Click handler for button2 (Select the Scanner).
    Private Sub button2_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles button2.Click 
       ' Select the scanner to use 
       ' NOTE(review): Nothing is passed as the source name; presumably this makes the Twain session
       ' prompt the user to pick a source - confirm against the LEADTOOLS documentation
       _twainSession.SelectSource(Nothing) 
    End Sub 
    // Click handler for button2 (Select the Scanner). 
    private void button2_Click(object sender, EventArgs e) 
    { 
       // Select the scanner to use 
       // NOTE(review): null is passed as the source name; presumably this makes the Twain session 
       // prompt the user to pick a source - confirm against the LEADTOOLS documentation 
       _twainSession.SelectSource(null); 
    } 

  13. Add the following code for the button3 (Scan and OCR) control’s Click handler:

    C#
    VB
    ' Click handler for button3 (Scan and OCR).
    ' Scans one or more pages, recognizes them and saves the result as "Scanned.pdf" in the
    ' output directory, then opens the PDF file. Does nothing further if no page was scanned.
    Private Sub button3_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles button3.Click
       ' Create the output directory; CreateDirectory does nothing if it already exists
       System.IO.Directory.CreateDirectory(_outputDirectory)
       ' Build the output PDF file name
       Dim pdfFileName As String = System.IO.Path.Combine(_outputDirectory, "Scanned.pdf")
       ' First remove all the pages added to the OCR document
       _ocrDocument.Pages.Clear()
       ' Scan the new page(s); the AcquirePage handler adds each one to the OCR document
       _twainSession.Acquire(TwainUserInterfaceFlags.Show)
       ' If the scan was canceled or produced no pages there is nothing to recognize or save
       If (_ocrDocument.Pages.Count = 0) Then
          Return
       End If
       ' Recognize and save as PDF
       _ocrDocument.Pages.Recognize(Nothing)
       _ocrDocument.Save(pdfFileName, DocumentFormat.Pdf, Nothing)
       ' Show the result PDF file
       System.Diagnostics.Process.Start(pdfFileName)
    End Sub
    // Click handler for button3 (Scan and OCR).
    // Scans one or more pages, recognizes them and saves the result as "Scanned.pdf" in the
    // output directory, then opens the PDF file. Does nothing further if no page was scanned.
    private void button3_Click(object sender, EventArgs e)
    {
       // Create the output directory; CreateDirectory does nothing if it already exists
       System.IO.Directory.CreateDirectory(_outputDirectory);
       // Build the output PDF file name
       string pdfFileName = System.IO.Path.Combine(_outputDirectory, "Scanned.pdf");
       // First remove all the pages added to the OCR document
       _ocrDocument.Pages.Clear();
       // Scan the new page(s); the AcquirePage handler adds each one to the OCR document
       _twainSession.Acquire(TwainUserInterfaceFlags.Show);
       // If the scan was canceled or produced no pages there is nothing to recognize or save
       if(_ocrDocument.Pages.Count == 0)
       {
          return;
       }
       // Recognize and save as PDF
       _ocrDocument.Pages.Recognize(null);
       _ocrDocument.Save(pdfFileName, DocumentFormat.Pdf, null);
       // Show the result PDF file
       System.Diagnostics.Process.Start(pdfFileName);
    }

  14. Add the private method to handle the AcquirePage event of the TwainSession object:

    C#
    VB
    ' Handles TwainSession.AcquirePage: cleans the scanned image with the image processing
    ' commands and then adds it as a new page to the OCR document.
    Private Sub _twainSession_AcquirePage(ByVal sender As Object, ByVal e As TwainAcquirePageEventArgs)
       ' The page that was just scanned
       Dim scannedPage As RasterImage = e.Image

       ' Deskew and Despeckle are run on every page
       deskewCmd.Run(scannedPage)
       despeckleCmd.Run(scannedPage)

       ' The rest of the commands only work on 1 BPP image
       Dim isOneBitPerPixel As Boolean = (scannedPage.BitsPerPixel = 1)
       If isOneBitPerPixel Then
          dotRemoveCmd.Run(scannedPage)
          holePunchRemoveCmd.Run(scannedPage)

          ' Run the same LineRemove command twice: vertical lines first, then horizontal lines
          Dim lineTypes() As LineRemoveCommandType = {LineRemoveCommandType.Vertical, LineRemoveCommandType.Horizontal}
          For Each lineType As LineRemoveCommandType In lineTypes
             lineRemoveCmd.Type = lineType
             lineRemoveCmd.Run(scannedPage)
          Next
       End If

       ' The cleaned image becomes a new page of the OCR document
       _ocrDocument.Pages.AddPage(scannedPage, Nothing)
    End Sub
    // Handles TwainSession.AcquirePage: cleans the scanned image with the image processing
    // commands and then adds it as a new page to the OCR document.
    private void _twainSession_AcquirePage(object sender, TwainAcquirePageEventArgs e)
    {
       // The page that was just scanned
       RasterImage scannedPage = e.Image;

       // Deskew and Despeckle are run on every page
       deskewCmd.Run(scannedPage);
       despeckleCmd.Run(scannedPage);

       // The rest of the commands only work on 1 BPP image
       bool isOneBitPerPixel = (scannedPage.BitsPerPixel == 1);
       if(isOneBitPerPixel)
       {
          dotRemoveCmd.Run(scannedPage);
          holePunchRemoveCmd.Run(scannedPage);

          // Run the same LineRemove command twice: vertical lines first, then horizontal lines
          LineRemoveCommandType[] lineTypes = { LineRemoveCommandType.Vertical, LineRemoveCommandType.Horizontal };
          foreach(LineRemoveCommandType lineType in lineTypes)
          {
             lineRemoveCmd.Type = lineType;
             lineRemoveCmd.Run(scannedPage);
          }
       }

       // The cleaned image becomes a new page of the OCR document
       _ocrDocument.Pages.AddPage(scannedPage, null);
    }

  15. Build, and Run the program to test it. Click the buttons in the following order to create the PDF file: "Change output directory" (optional), "Select the Scanner", then "Scan and OCR".
Products | Support | Contact Us | Copyright Notices
© 1991-2017 LEAD Technologies, Inc. All Rights Reserved.
LEADTOOLS Imaging, Medical, and Document
Click or drag to resize