Platforms to show: All Mac Windows Linux Cross-Platform

/DynaPDF/Text extraction


Required plugins for this example: MBS DynaPDF Plugin

You can find this example project in your plugins download as a Xojo project file within the examples folder: /DynaPDF/Text extraction

This example is the version from Thu, 31st Jul 2019.

Project "Text extraction.xojo_binary_project"
Class App Inherits Application
Const kEditClear = "&Löschen"
Const kFileQuit = "Beenden"
Const kFileQuitShortcut = ""
EventHandler Sub Open()
  // Entry point of the example: lets the user pick a PDF file, imports all of its
  // pages into an in-memory DynaPDF document, extracts the text of every page into
  // "<pdf name>.txt" on the desktop and finally reports the elapsed time.

  dim f as FolderItem = GetOpenFolderItem(FileTypes1.pdf)
  if f = nil then
    // Dialog was cancelled. Leave the handler explicitly after requesting the quit
    // so that none of the code below can run with a nil FolderItem.
    quit
    Return
  end if

  dim timeStart as integer = ticks

  dim pdf as new MyDynapdfMBS

  pdf.SetLicenseKey "Pro" // For this example you can use a Pro or Enterprise License

  call pdf.CreateNewPDF(nil) // We do not produce a PDF file in this example!

  // External cmaps should always be loaded when extracting text from PDF files.
  // See the description of ParseContent() for further information.
  dim folder as FolderItem = FindFile("CMap")
  if folder <> nil then
    call pdf.SetCMapDir(folder, pdf.klcmRecursive)
  end if

  call pdf.SetImportFlags(BitwiseOr(pdf.kifImportAll, pdf.kifImportAsPage))
  if pdf.OpenImportFile(f, pdf.kptOpen, "") < 0 then
    MsgBox "Failed to load input pdf"+EndOfLine+EndOfLine+f.NativePath
    // Same as above: do not fall through into the import code after a failed open.
    quit
    Return
  end if

  // Import all pages and close the file
  call pdf.ImportPDFFile(1, 1.0, 1.0)
  call pdf.CloseImportFile

  // We flatten markup annotations and form fields so that we can extract the text from these objects too.
  call pdf.FlattenAnnots(pdf.kaffMarkupAnnots)
  call pdf.FlattenForm

  dim out as FolderItem = SpecialFolder.Desktop.Child(f.name+".txt")
  dim parser as new TextExtraction(out, pdf)

  dim count as integer = pdf.GetPageCount
  for i as integer = 1 to count
    // We write a page identifier to the file so that we know which page the text comes from.
    parser.WritePageIdentifier(i)

    // Only parse (and close) pages that could actually be opened for editing.
    if pdf.EditPage(i) then
      parser.ParsePage
      call pdf.EndPage
    end if
  next

  // Ticks are counted in 1/60 of a second
  timeStart = ticks - timeStart
  MsgBox str(timeStart/60)+" seconds needed."
End EventHandler
Function FindFile(name as string) As FolderItem
  // Searches for an item called "name", starting in the folder that contains the
  // executable and walking up through all parent folders.
  // Returns the first existing match, or Nil when the root was passed without a hit.
  dim folder as FolderItem = app.ExecutableFile.Parent

  do until folder = Nil
    dim candidate as FolderItem = folder.Child(name)
    if candidate <> Nil then
      if candidate.Exists then Return candidate
    end if

    // Not here, continue one level up
    folder = folder.Parent
  loop

  Return Nil
End Function
End Class
Class Window1 Inherits Window
End Class
MenuBar MenuBar1
MenuItem FileMenu = "&Ablage"
MenuItem FileQuit = "#App.kFileQuit"
MenuItem EditMenu = "&Bearbeiten"
MenuItem EditUndo = "&Rückgängig"
MenuItem UntitledMenu1 = "-"
MenuItem EditCut = "&Ausschneiden"
MenuItem EditCopy = "&Kopieren"
MenuItem EditPaste = "&Einfügen"
MenuItem EditClear = "#App.kEditClear"
MenuItem UntitledMenu0 = "-"
MenuItem EditSelectAll = "&Alles auswählen"
End MenuBar
Class TextExtraction
Const MAX_LINE_ERROR = 4.0
Enum TextDirection tfLeftToRight=0 tfRightToLeft=1 tfTopToBottom=2 tfBottomToTop=4 tfNotInitialized=-1 End Enum
Sub AddText()
  // Writes the text record currently held in Stack to the output file.
  // Depending on where the record starts relative to the end of the previous
  // record, a line break or a space is emitted first. Afterwards the end position
  // and direction of this record are stored for the next call.
  const space = " "

  dim textDir as TextDirection = TextDirection.tfNotInitialized
  dim x1 as double = 0.0
  dim y1 as double = 0.0
  dim x2 as double = 0.0
  dim y2 as double = Stack.FontSize

  // Transform the text matrix to user space
  dim m as DynapdfMatrixMBS = MulMatrix(Stack.ctm, Stack.tm)

  // Start point of the text record
  Transform(m, x1, y1)

  // The second point, (0, FontSize) in text space, is the "up" vector of the glyphs.
  // It determines the text direction and can also be used to calculate the visible
  // font size measured in user space:
  //   realFontSize = CalcDistance(x1, y1, x2, y2)
  Transform(m, x2, y2)

  // Determine the text direction.
  // If the up vector is horizontal (y1 = y2) the text runs vertically, otherwise it
  // runs horizontally. The value is only compared for equality with the previous
  // record, so the labels do not influence the output, but they should be correct.
  if y1 = y2 then
    if x1 > x2 then
      textDir = TextDirection.tfBottomToTop
    else
      textDir = TextDirection.tfTopToBottom
    end if
  else
    if y1 > y2 then
      textDir = TextDirection.tfRightToLeft
    else
      textDir = TextDirection.tfLeftToRight
    end if
  end if

  // Wrong direction or not on the same text line?
  if textDir <> LastTextDir or false = IsPointOnLine(x1, y1, LastTextEndX, LastTextEndY, LastTextInfX, LastTextInfY) then
    // Extend the x-coordinate to an infinite point.
    LastTextInfX = 1000000.0
    LastTextInfY = 0.0
    Transform(m, LastTextInfX, LastTextInfY)

    if LastTextDir <> TextDirection.tfNotInitialized then
      // Add a new line to the output file
      out.WriteLine
    end if
  else
    // The space width is measured in text space but the distance between two text
    // records is measured in user space! We must transform the space width to user
    // space before we can compare the values.
    // Note that we use the full space width here because the end position of the last
    // record was set to the record width minus the half space width.
    dim x3 as double = Stack.SpaceWidth
    dim y3 as double = 0.0
    Transform(m, x3, y3)

    dim userSpaceWidth as double = CalcDistance(x1, y1, x3, y3)
    dim distance as double = CalcDistance(LastTextEndX, LastTextEndY, x1, y1)
    if distance > userSpaceWidth then
      // Add a space to the output file
      Write space
    end if
  end if

  // Kerning advances below minus half a space width are treated as a word gap.
  dim halfSpace as double = -Stack.SpaceWidth * 0.5

  for i as integer = 0 to Stack.KerningCount-1
    if Stack.KerningAdvance(i) < halfSpace then
      // Add a space to the output file
      Write space
    end if
    Write Stack.KerningText(i)
  next

  // We don't set the cursor to the real end of the string because applications like MS Word
  // often add a space to the end of a text record and this space can slightly overlap the next
  // record. IsPointOnLine() would return false if the new record overlaps the previous one.
  LastTextEndX = Stack.TextWidth + halfSpace // halfSpace is a negative value!
  LastTextEndY = 0.0
  LastTextDir = textDir

  // Calculate the end coordinate of the text record
  Transform m, LastTextEndX, LastTextEndY
End Sub
Shared Function CalcDistance(x1 as double, y1 as Double, x2 as Double, y2 as Double) As Double
  // Returns the Euclidean distance between the points (x1, y1) and (x2, y2).
  dim deltaX as double = x2 - x1
  dim deltaY as double = y2 - y1
  dim squaredLength as double = deltaX*deltaX + deltaY*deltaY
  return sqrt(squaredLength)
End Function
Sub Constructor(file as FolderItem, pdf as DynaPDFMBS)
  // Creates an extractor that reads text from "pdf" and writes it to "file".
  // The output file is created as a text file right away.

  // PDF instance to read from and the stack object used for the text records
  me.pdf = pdf
  me.stack = new DynaPDFStackMBS

  // No text record has been processed yet
  LastTextDir = TextDirection.tfNotInitialized
  LastTextX = 0.0
  LastTextY = 0.0
  LastTextEndX = 0.0
  LastTextEndY = 0.0
  LastTextInfX = 0.0
  LastTextInfY = 0.0

  out = file.CreateTextFile
End Sub
Shared Function IsPointOnLine(x as Double, y as Double, x0 as Double, y0 as Double, x1 as Double, y1 as Double) As Boolean
  // Checks whether the point (x, y) lies close to the line segment that runs
  // from (x0, y0) to (x1, y1).
  // The point is projected onto the segment (clamped to its end points) and the
  // squared distance to that projection is compared with MAX_LINE_ERROR.
  // Returns true if the squared distance is smaller than MAX_LINE_ERROR.

  // Vector from the segment start to the point
  dim px as Double = x - x0
  dim py as Double = y - y0

  // Direction vector of the segment
  dim sx as Double = x1 - x0
  dim sy as Double = y1 - y0

  // Relative position of the projection on the segment (0 = start, 1 = end).
  // For a zero length segment the division would be 0/0, so the projection
  // simply stays on the start point in that case.
  dim t as Double = 0.0
  dim lengthSquared as Double = sx*sx + sy*sy
  if lengthSquared > 0.0 then
    t = (px*sx + py*sy) / lengthSquared
    if t < 0.0 then
      t = 0.0
    elseif t > 1.0 then
      t = 1.0
    end if
  end if

  // Offset between the point and its projection
  dim ex as Double = px - t * sx
  dim ey as Double = py - t * sy

  // Note: this is the squared distance, no square root is taken
  return (ex*ex + ey*ey < MAX_LINE_ERROR)
End Function
Shared Function MulMatrix(m1 as DynapdfMatrixMBS, m2 as DynapdfMatrixMBS) As DynapdfMatrixMBS
  // Multiplies two transformation matrices and returns the product as a new matrix.
  // Used with Transform(), the result maps a point the same way as applying m2
  // first and m1 afterwards. Neither argument is modified.
  dim product as new DynapdfMatrixMBS

  // Linear part
  product.a = m2.a * m1.a + m2.b * m1.c
  product.b = m2.a * m1.b + m2.b * m1.d
  product.c = m2.c * m1.a + m2.d * m1.c
  product.d = m2.c * m1.b + m2.d * m1.d

  // Translation part
  product.x = m2.x * m1.a + m2.y * m1.c + m1.x
  product.y = m2.x * m1.b + m2.y * m1.d + m1.y

  Return product
End Function
Sub ParsePage()
  // Extracts the text of the page that is currently open in pdf:
  // first the page content itself, then the templates used by it.

  // Start with an empty list of already visited templates
  redim templates(-1)

  // Nothing to do if the stack can't be initialized
  if not pdf.InitStack(stack) then Return

  // Reset the state that is carried over from one text record to the next
  LastTextDir = TextDirection.tfNotInitialized
  LastTextX = 0.0
  LastTextY = 0.0
  LastTextEndX = 0.0
  LastTextEndY = 0.0
  LastTextInfX = 0.0
  LastTextInfY = 0.0

  ParseText
  ParseTemplates
End Sub
Sub ParseTemplates()
  // Extracts the text of all templates of the currently open page or template.
  // Each template handle is recorded in the templates array so that a template
  // is parsed only once per page. Nested templates are handled by recursion.
  dim tmplCount as integer = pdf.GetTemplCount

  for i as integer = 0 to tmplCount-1
    if pdf.EditTemplate(i) = false then Return

    dim template as integer = pdf.GetTemplHandle
    if templates.IndexOf(template) < 0 then
      templates.Append template

      if pdf.InitStack(stack) = false then
        // Close the template we just opened before giving up
        call pdf.EndTemplate
        Return
      end if

      ParseText

      // Recursion for sub templates. A single call is enough because
      // ParseTemplates loops over all templates of the open template itself.
      ParseTemplates
    end if

    call pdf.EndTemplate
  next
End Sub
Sub ParseText()
  // Reads the text records of the currently open page or template one after
  // another and passes each of them to AddText.

  // Fetch the first text record, if there is one
  dim moreRecords as Boolean = pdf.GetPageText(Stack)

  // Neither further records nor any text in this one: nothing to extract
  if not moreRecords and Stack.TextLen = 0 then return

  AddText

  // Keep going until GetPageText reports that no record is left.
  // GetPageText is not called again if the first call already returned false.
  while moreRecords and pdf.GetPageText(Stack)
    AddText
  wend
End Sub
Shared Sub Transform(m as DynapdfMatrixMBS, byref x as Double, byref y as Double)
  // Applies the matrix m to the point (x, y). The result replaces x and y.
  // Both input coordinates are copied first because x is overwritten
  // before y is calculated.
  dim srcX as Double = x
  dim srcY as Double = y

  x = srcX * m.a + srcY * m.c + m.x
  y = srcX * m.b + srcY * m.d + m.y
End Sub
Sub Write(t as string)
  // Writes t to the output file after converting it to UTF-8.
  dim utf8Text as string = ConvertEncoding(t, encodings.UTF8)
  out.Write utf8Text
End Sub
Sub WritePageIdentifier(pagenum as integer)
  // Writes a separator line containing the page number to the output file.
  // From the second page on, a line break is written before the separator.
  dim header as string = "%----------------------- Page "+str(pagenum)+" -----------------------------"

  if pagenum > 1 then out.WriteLine

  out.WriteLine header
End Sub
Property LastTextDir As TextDirection
Property LastTextEndX As Double
Property LastTextEndY As Double
Property LastTextInfX As Double
Property LastTextInfY As Double
Property LastTextX As Double
Property LastTextY As Double
Property Templates() As Integer
Property out As TextOutputStream
Property pdf As DynaPDFMBS
Property stack As DynaPDFStackMBS
End Class
FileTypes1
Filetype application/pdf
End FileTypes1
Class MyDynaPDFMBS Inherits DynaPDFMBS
EventHandler Function Error(ErrorCode as integer, ErrorMessage as string, ErrorType as integer) As integer
  // Called for warnings and errors reported while processing the PDF.
  // The message is logged and shown in a dialog. Returns 0 to continue
  // or -1 if the user pressed the Cancel button.
  dim details as string = str(ErrorCode)+": "+ErrorMessage

  // Output all messages on the console
  System.DebugLog details

  // Prepare the dialog
  dim dlg as New MessageDialog
  dlg.icon = MessageDialog.GraphicCaution // display warning icon
  dlg.ActionButton.Caption = "Continue"
  dlg.CancelButton.Visible = True // show the Cancel button

  // A warning or an error?
  dim isWarning as Boolean = (BitAnd(ErrorType, me.kE_WARNING) = me.kE_WARNING)

  if isWarning then
    // The user decided earlier to ignore warnings, so no dialog is shown
    if IgnoreWarnings then Return 0

    dlg.Message = "A warning occurred while processing your PDF code."

    // Third button which suppresses the dialog for all following warnings
    dlg.AlternateActionButton.Caption = "Ignore warnings"
    dlg.AlternateActionButton.Visible = true
  else
    dlg.Message = "An error occurred while processing your PDF code."
  end if

  dlg.Explanation = details

  // Display the dialog and check which button was pressed
  dim pressed as MessageDialogButton = dlg.ShowModal

  if pressed = dlg.CancelButton then
    Return -1 // stop
  end if

  if pressed = dlg.AlternateActionButton then
    IgnoreWarnings = true
  end if

  Return 0 // ignore
End EventHandler
Property IgnoreWarnings As Boolean
End Class
End Project

See also:

The items on this page are in the following plugins: MBS DynaPDF Plugin.


The biggest plugin in space...