Welcome Guest! To enable all features, please Login or Register.

Notification

Icon
Error

Options
View
Last Go to last post Unread Go to first unread post
#1 Posted : Thursday, May 19, 2022 2:41:39 PM(UTC)

Amin  
Amin

Groups: Manager, Tech Support
Posts: 363


This C# code sample shows how to use the LEADDocument class to parse the words from an input document, then use the DocumentWriter class to split the words from each page of that document into two separate PDF files.
The first PDF file contains the words from the upper half of the page and the second PDF file contains the words from the lower half of the page.

This means if the source file is named input.pdf and it has 2 pages, the output will be 4 PDF files as follows:

input.pdfpage1_Top.pdf _____ Upper half of page 1
input.pdfpage1_Bot.pdf _____ Lower half of page 1
input.pdfpage2_Top.pdf _____ Upper half of page 2
input.pdfpage2_Bot.pdf _____ Lower half of page 2

Code:
/// <summary>
/// Loads a document, extracts the words of every page (using the LEAD OCR engine
/// when the page text has to be recognized) and hands each page's words to
/// <c>SplitPageWords</c>, which writes the upper-half and lower-half PDF files.
/// </summary>
/// <param name="_documentFile">Path of the input document.</param>
void ExtractDocumentText(string _documentFile)
{
   // The OCR engine is created first so that it outlives the document that references it;
   // the using statements dispose the document, then the codecs, then the engine.
   using (IOcrEngine _ocrEngine = OcrEngineManager.CreateEngine(OcrEngineType.LEAD))
   {
      _ocrEngine.Startup(null, null, null, @"C:\LEADTOOLS22\Bin\Common\OcrLEADRuntime");
      try
      {
         // One RasterCodecs instance is enough to query the information of every page.
         using (RasterCodecs infoCodecs = new RasterCodecs())
         using (LEADDocument _document = DocumentFactory.LoadFromFile(_documentFile, new LoadDocumentOptions()))
         {
            _document.Text.OcrEngine = _ocrEngine;

            int pageNumber = 0;
            foreach (DocumentPage _page in _document.Pages)
            {
               pageNumber++; // pages are 1-based; counting avoids an IndexOf search per page

               DocumentPageText _pageText = _page.GetText();
               _pageText.BuildWords();

               double sizeFactor;
               using (CodecsImageInfo pageInfo = infoCodecs.GetInformation(_documentFile, false, pageNumber))
               {
                  // we'll use 300 dpi: scale from document page units to pixels at that resolution
                  double sizeAt300 = pageInfo.Height * 300.0 / pageInfo.YResolution;
                  sizeFactor = sizeAt300 / _page.Size.Height;
               }

               SplitPageWords(_documentFile, pageNumber, sizeFactor, _pageText.Words);
            }
         }
      }
      finally
      {
         // Always shut the engine down, even when a page fails to process.
         _ocrEngine.Shutdown();
      }
   }
}

/// <summary>
/// Splits the words of one page into two image-over-text PDF files: one with the
/// words whose bounds end above the vertical middle of the page, and one with the rest.
/// The files are written as <c>inputFile + "page{pageNumber}_Top.pdf"</c> and
/// <c>inputFile + "page{pageNumber}_Bot.pdf"</c>.
/// </summary>
/// <param name="inputFile">Path of the source document; also the prefix of the output file names.</param>
/// <param name="pageNumber">1-based number of the page to load from <paramref name="inputFile"/>.</param>
/// <param name="sizeFactor">Factor that scales word bounds to pixels of the page image loaded at 300 dpi.</param>
/// <param name="words">Words of the page; a null list is treated as a page without words.</param>
void SplitPageWords(string inputFile, int pageNumber, double sizeFactor, IList<DocumentWord> words)
{
   using (RasterCodecs codecs = new RasterCodecs())
   {
      codecs.Options.RasterizeDocument.Load.Resolution = 300;
      codecs.Options.Pdf.Load.DisplayDepth = 24;

      using (RasterImage img = codecs.Load(inputFile, pageNumber))
      using (RasterImage imgTop = img.Clone())
      using (RasterImage imgBot = img.Clone())
      using (MemoryStream msTop = new MemoryStream())
      using (MemoryStream msBot = new MemoryStream())
      using (Metafile mfTop = CreateRecordingMetafile(msTop))
      using (Metafile mfBot = CreateRecordingMetafile(msBot))
      {
         // Start with the whole image selected; word rectangles are removed from
         // this region below so that only the non-word area gets erased.
         imgTop.AddRectangleToRegion(null, new LeadRect(0, 0, imgTop.Width, imgTop.Height), RasterRegionCombineMode.Set);
         imgBot.AddRectangleToRegion(null, new LeadRect(0, 0, imgBot.Width, imgBot.Height), RasterRegionCombineMode.Set);

         using (Graphics gEmfTop = Graphics.FromImage(mfTop))
         using (Graphics gEmfBot = Graphics.FromImage(mfBot))
         using (Font baseFont = SystemFonts.DefaultFont) // fetched once instead of once per word
         {
            gEmfTop.DrawRectangle(Pens.White, 0, 0, imgTop.Width, imgTop.Height);
            gEmfBot.DrawRectangle(Pens.White, 0, 0, imgBot.Width, imgBot.Height);

            if (words != null)
            {
               int middle = img.Height / 2;
               foreach (var word in words)
               {
                  LeadRect wordBounds = new LeadRect((int)(word.Bounds.X * sizeFactor), (int)(word.Bounds.Y * sizeFactor), (int)(word.Bounds.Width * sizeFactor), (int)(word.Bounds.Height * sizeFactor));
                  if (wordBounds.Width <= 0)
                  {
                     // Nothing to draw or keep: a zero width would request a zero-size font.
                     continue;
                  }

                  bool isTop = wordBounds.Bottom < middle;
                  Graphics targetGraphics = isTop ? gEmfTop : gEmfBot;
                  RasterImage targetImage = isTop ? imgTop : imgBot;

                  // Scale the font so the drawn word is about as wide as its bounds.
                  // The 4f is subtracted from the measured width as a padding allowance.
                  float measuredWidth = gEmfTop.MeasureString(word.Value, baseFont).Width - 4f;
                  float fontSize = measuredWidth > 0f ? baseFont.Size * (wordBounds.Width / measuredWidth) : 0f;
                  if (fontSize > 0f && !float.IsInfinity(fontSize))
                  {
                     using (Font scaledFont = new Font(baseFont.FontFamily, fontSize))
                     {
                        targetGraphics.DrawString(word.Value, scaledFont, Brushes.White, wordBounds.X, wordBounds.Y);
                     }
                  }

                  // Add the word rectangle to the "don't erase" region
                  targetImage.AddRectangleToRegion(null, wordBounds, RasterRegionCombineMode.AndNotRegion);
               }
            }
         } // disposing the Graphics objects finishes recording the text into the metafiles

         // Fill the non-words area in both images with white to erase it
         FillCommand fill = new FillCommand(RasterColor.White);
         fill.Run(imgTop);
         fill.Run(imgBot);

         WriteHalfPagePdf(imgTop, mfTop, inputFile + $"page{pageNumber}_Top.pdf");
         WriteHalfPagePdf(imgBot, mfBot, inputFile + $"page{pageNumber}_Bot.pdf");
      }
   }
}

// Creates a metafile that records into the given stream, using a temporary
// Graphics from CreateGraphics() only as the reference device context.
Metafile CreateRecordingMetafile(MemoryStream target)
{
   using (var reference = CreateGraphics())
   {
      var hdc = reference.GetHdc();
      try
      {
         return new Metafile(target, hdc);
      }
      finally
      {
         // The HDC is only needed while the metafile is being constructed.
         reference.ReleaseHdc(hdc);
      }
   }
}

// Writes a single-page PDF/A file with the raster image placed over the text recorded in the metafile.
void WriteHalfPagePdf(RasterImage image, Metafile metafile, string outputFile)
{
   DocumentWriterEmfPage page = new DocumentWriterEmfPage();
   page.Image = image;
   // NOTE(review): GetHenhmetafile returns a native EMF handle; confirm in the
   // DocumentWriterEmfPage documentation whether it must be freed after AddPage.
   page.EmfHandle = metafile.GetHenhmetafile();

   DocumentWriter writer = new DocumentWriter();
   PdfDocumentOptions pdfOptions = (PdfDocumentOptions)writer.GetOptions(DocumentFormat.Pdf);
   pdfOptions.DocumentType = PdfDocumentType.PdfA;
   pdfOptions.ImageOverText = true;
   pdfOptions.DocumentResolution = image.XResolution;
   pdfOptions.EmptyPageResolution = image.XResolution;
   writer.SetOptions(DocumentFormat.Pdf, pdfOptions);

   writer.BeginDocument(outputFile, DocumentFormat.Pdf);
   writer.AddPage(page);
   writer.EndDocument();
}
Amin Dodin

Senior Support Engineer
LEAD Technologies, Inc.
LEAD Logo
 

Try the latest version of LEADTOOLS for free for 60 days by downloading the evaluation: https://www.leadtools.com/downloads

Want to join the discussion? Log in to your LEADTOOLS Support account or register a new forum account.

You cannot post new topics in this forum.
You cannot reply to topics in this forum.
You cannot delete your posts in this forum.
You cannot edit your posts in this forum.
You cannot create polls in this forum.
You cannot vote in polls in this forum.

Powered by YAF.NET | YAF.NET © 2003-2022, Yet Another Forum.NET
This page was generated in 0.029 seconds.