How To Extract Text From a Document in PDF Mosaic Library .NET

This sample shows how to extract text from a document.

Use the PDFPage.GetText() method to extract the text of a page as plain text.

C# :

using PDFMosaic;
using System.Drawing;
using System.IO;
using System.Diagnostics;

namespace ExtractText
{
  /// <summary>
  /// Sample program: writes the text of every page of a PDF document to a text file.
  /// </summary>
  class ExtractText
  {
    /// <summary>
    /// Loads "residential.pdf", writes the result of GetText() for each page to
    /// "Document text.txt" (one WriteLine per page), saves the document as
    /// "ExtractText.pdf", and then passes the text file to Process.Start.
    /// </summary>
    static void Main()
    {
      // Relative path: resolved against the process's current working directory.
      PDFDocument document = new PDFDocument("..\\..\\residential.pdf");

      // The using block disposes (flushes and closes) the writer even when
      // GetText() or WriteLine() throws, so the file handle is never leaked.
      using (StreamWriter writer = new StreamWriter("Document text.txt"))
      {
        for (int i = 0; i < document.Pages.Count; ++i)
        {
          writer.WriteLine(document.Pages[i].GetText());
        }
      }

      document.Save("ExtractText.pdf", true);

      // The writer is already closed here, so the file is complete before it is launched.
      Process.Start("Document text.txt");
    }
  }
}

VB.NET :

Imports PDFMosaic
Imports System.Drawing
Imports System.IO
Imports System.Diagnostics

''' <summary>
''' Sample program: writes the text of every page of a PDF document to a text file.
''' </summary>
Module ExtractText
  ''' <summary>
  ''' Loads "residential.pdf", writes the result of GetText() for each page to
  ''' "Document text.txt" (one WriteLine per page), saves the document as
  ''' "ExtractText.pdf", and then passes the text file to Process.Start.
  ''' </summary>
  Sub Main()
    ' VB string literals have no backslash escape sequences, so each path
    ' separator is written as a single backslash (unlike the C# literal).
    Dim document As New PDFDocument("..\..\residential.pdf")

    ' Using disposes (flushes and closes) the writer even when GetText() or
    ' WriteLine() throws, so the file handle is never leaked.
    Using writer As New StreamWriter("Document text.txt")
      For i As Integer = 0 To document.Pages.Count - 1
        writer.WriteLine(document.Pages(i).GetText())
      Next
    End Using

    document.Save("ExtractText.pdf", True)

    ' The writer is already closed here, so the file is complete before it is launched.
    Process.Start("Document text.txt")
  End Sub
End Module