Platforms to show: All Mac Windows Linux Cross-Platform
Required plugins for this example: MBS DynaPDF Plugin
You find this example project in your Plugins Download as a Xojo project file within the examples folder: /DynaPDF/Text extraction
This example is the version from Thu, 31st Jul 2019.
Project "Text extraction.xojo_binary_project"
Class App Inherits Application
Const kEditClear = "&Löschen"
Const kFileQuit = "Beenden"
Const kFileQuitShortcut = ""
EventHandler Sub Open()
  // Application start-up: asks the user for a PDF, imports all of its pages into an
  // in-memory DynaPDF document, flattens markup annotations and form fields, and writes
  // the extracted text of every page to "<pdf name>.txt" on the Desktop.
  // Finally shows how long the run took.
  dim f as FolderItem = GetOpenFolderItem(FileTypes1.pdf)
  if f = nil then
    // The user cancelled the dialog. Return explicitly after quit so that none of the
    // code below can run with a nil FolderItem.
    quit
    Return
  end if
  dim timeStart as integer = ticks
  dim pdf as new MyDynapdfMBS
  pdf.SetLicenseKey "Pro" // For this example you can use a Pro or Enterprise License
  call pdf.CreateNewPDF(nil) // We do not produce a PDF file in this example!
  // External cmaps should always be loaded when extracting text from PDF files.
  // See the description of ParseContent() for further information.
  dim folder as FolderItem = FindFile("CMap")
  if folder <> nil then
    call pdf.SetCMapDir(folder, pdf.klcmRecursive)
  end if
  call pdf.SetImportFlags(BitwiseOr(pdf.kifImportAll, pdf.kifImportAsPage))
  if pdf.OpenImportFile(f, pdf.kptOpen, "") < 0 then
    MsgBox "Failed to load input pdf"+EndOfLine+EndOfLine+f.NativePath
    // Return explicitly after quit so that we do not go on to import from a file
    // that could not be opened.
    quit
    Return
  end if
  // import all pages and close file
  call pdf.ImportPDFFile(1, 1.0, 1.0)
  call pdf.CloseImportFile
  // We flatten markup annotations and form fields so that we can extract the text from these objects too.
  call pdf.FlattenAnnots(pdf.kaffMarkupAnnots)
  call pdf.FlattenForm
  dim out as FolderItem = SpecialFolder.Desktop.Child(f.name+".txt")
  dim parser as new TextExtraction(out, pdf)
  dim count as integer = pdf.GetPageCount
  for i as integer = 1 to count
    call pdf.EditPage(i)
    // We write a page identifier to the file so that we know which page the text comes from.
    parser.WritePageIdentifier(i)
    parser.ParsePage
    call pdf.EndPage
  next
  // Ticks are counted in 1/60 second, so dividing by 60 yields seconds.
  timeStart = ticks - timeStart
  MsgBox str(timeStart/60)+" seconds needed."
End EventHandler
Function FindFile(name as string) As FolderItem
  // Searches for an item called "name", starting in the folder that contains the
  // executable and moving up one parent folder at a time.
  // Returns the first existing match, or nil when the top of the folder chain is
  // reached without finding one.
  dim dir as FolderItem = app.ExecutableFile.Parent
  do until dir = nil
    dim candidate as FolderItem = dir.Child(name)
    if candidate <> nil then
      if candidate.Exists then Return candidate
    end if
    // Not here, continue one level higher.
    dir = dir.Parent
  loop
  Return nil
End Function
End Class
Class Window1 Inherits Window
End Class
MenuBar MenuBar1
MenuItem FileMenu = "&Ablage"
MenuItem FileQuit = "#App.kFileQuit"
MenuItem EditMenu = "&Bearbeiten"
MenuItem EditUndo = "&Rückgängig"
MenuItem UntitledMenu1 = "-"
MenuItem EditCut = "&Ausschneiden"
MenuItem EditCopy = "&Kopieren"
MenuItem EditPaste = "&Einfügen"
MenuItem EditClear = "#App.kEditClear"
MenuItem UntitledMenu0 = "-"
MenuItem EditSelectAll = "&Alles auswählen"
End MenuBar
Class TextExtraction
Const MAX_LINE_ERROR = 4.0
Enum TextDirection
tfLeftToRight=0
tfRightToLeft=1
tfTopToBottom=2
tfBottomToTop=4
tfNotInitialized=-1
End Enum
Sub AddText()
  // Writes the text record currently held in Stack to the output file.
  // Before the text itself, a line break is written when the record does not continue
  // the previous text line, or a single space when it continues the line but starts
  // further away from the previous record than one space width.
  // Afterwards the end position and direction of this record are remembered in
  // LastTextEndX/LastTextEndY/LastTextDir for the next call.
  const space = " "
  dim textDir as TextDirection = TextDirection.tfNotInitialized
  dim x1 as double = 0.0
  dim y1 as double = 0.0
  dim x2 as double = 0.0
  dim y2 as double = Stack.FontSize
  // Transform the text matrix to user space
  dim m as DynapdfMatrixMBS = MulMatrix(Stack.ctm, Stack.tm)
  // Start point of the text record
  Transform(m, x1, y1)
  // The second point to determine the text direction can also be used to calculate
  // the visible font size measured in user space:
  // double realFontSize = CalcDistance(x1, y1, x2, y2);
  Transform(m, x2, y2)
  // Determine the text direction.
  // (x1,y1) -> (x2,y2) is the transformed "up" vector (0, FontSize) of the glyphs.
  // If that vector is horizontal (y1 = y2) the baseline is vertical, otherwise the
  // text is treated as horizontal. The value is only compared with LastTextDir, so
  // the labels do not influence the written output; they are chosen to match the geometry.
  if y1 = y2 then
    if x1 > x2 then
      textDir = TextDirection.tfBottomToTop
    else
      textDir = TextDirection.tfTopToBottom
    end if
  else
    if y1 > y2 then
      textDir = TextDirection.tfRightToLeft
    else
      textDir = TextDirection.tfLeftToRight
    end if
  end if
  // Wrong direction or not on the same text line?
  if textDir <> LastTextDir or false = IsPointOnLine(x1, y1, LastTextEndX, LastTextEndY, LastTextInfX, LastTextInfY) then
    // Extend the x-coordinate to an infinite point.
    LastTextInfX = 1000000.0
    LastTextInfY = 0.0
    Transform(m, LastTextInfX, LastTextInfY)
    if LastTextDir <> TextDirection.tfNotInitialized then
      // Add a new line to the output file (not before the very first record)
      out.WriteLine
    end if
  else
    // The space width is measured in text space but the distance between two text
    // records is measured in user space! We must transform the space width to user
    // space before we can compare the values.
    dim distance, userSpaceWidth as Double
    // Note that we use the full space width here because the end position of the last record
    // was set to the record width minus the half space width.
    dim x3 as double = Stack.SpaceWidth
    dim y3 as double = 0.0
    Transform(m, x3, y3)
    userSpaceWidth = CalcDistance(x1, y1, x3, y3)
    distance = CalcDistance(LastTextEndX, LastTextEndY, x1, y1)
    if distance > userSpaceWidth then
      // Add a space to the output file
      Write space
    end if
  end if
  // Negative half space width in text space; kerning advances below this
  // threshold are written as a space.
  dim halfSpace as double = -Stack.SpaceWidth * 0.5
  for i as integer = 0 to Stack.KerningCount-1
    if stack.KerningAdvance(i) < halfSpace then
      // Add a space to the output file
      Write space
    end if
    Write stack.KerningText(i)
  next
  // We don't set the cursor to the real end of the string because applications like MS Word
  // often add a space to the end of a text record and this space can slightly overlap the next
  // record. IsPointOnLine() would return false if the new record overlaps the previous one.
  LastTextEndX = Stack.TextWidth + halfSpace // halfSpace is a negative value!
  LastTextEndY = 0.0
  LastTextDir = textDir
  // Calculate the end coordinate of the text record
  Transform m, LastTextEndX, LastTextEndY
End Sub
Shared Function CalcDistance(x1 as double, y1 as Double, x2 as Double, y2 as Double) As Double
  // Returns the Euclidean distance between the points (x1, y1) and (x2, y2).
  dim deltaX as double = x2 - x1
  dim deltaY as double = y2 - y1
  dim squaredLength as double = deltaX*deltaX + deltaY*deltaY
  return sqrt(squaredLength)
End Function
Sub Constructor(file as FolderItem, pdf as DynaPDFMBS)
  // Creates an extractor that reads text from "pdf" and writes it to "file".
  // file: destination; a text file is created there.
  // pdf:  the DynaPDF instance whose pages are parsed later.

  // Keep the PDF object and allocate the stack that receives the text records.
  self.pdf = pdf
  self.stack = new DynaPDFStackMBS

  // No text record has been seen yet.
  LastTextDir = TextDirection.tfNotInitialized
  LastTextX = 0.0
  LastTextY = 0.0
  LastTextEndX = 0.0
  LastTextEndY = 0.0
  LastTextInfX = 0.0
  LastTextInfY = 0.0

  // Create the output text file.
  out = file.CreateTextFile
End Sub
Shared Function IsPointOnLine(x as Double, y as Double, x0 as Double, y0 as Double, x1 as Double, y1 as Double) As Boolean
  // Tests whether the point (x, y) lies close to the line segment from (x0, y0) to (x1, y1).
  // The point is projected onto the segment (clamped to its end points) and the
  // SQUARED distance to that projection is compared with MAX_LINE_ERROR.
  // Returns true when the squared distance is below MAX_LINE_ERROR.
  dim dx, dy, di as Double
  // Work relative to the segment start.
  x = x - x0
  y = y - y0
  dx = x1 - x0
  dy = y1 - y0
  dim lengthSquared as Double = dx*dx + dy*dy
  if lengthSquared > 0.0 then
    // Relative position of the projection along the segment, clamped to 0..1.
    di = (x*dx + y*dy) / lengthSquared
    if di < 0.0 then
      di = 0.0
    elseif di > 1.0 then
      di = 1.0
    end if
  else
    // Degenerate segment (both end points identical): dividing would yield 0/0.
    // Measure the distance to the single point instead.
    di = 0.0
  end if
  // Vector from the projected point to (x, y) and its squared length.
  dx = x - di * dx
  dy = y - di * dy
  di = dx*dx + dy*dy
  return (di < MAX_LINE_ERROR)
End Function
Shared Function MulMatrix(m1 as DynapdfMatrixMBS, m2 as DynapdfMatrixMBS) As DynapdfMatrixMBS
  // Combines the two matrices into a newly allocated matrix and returns it.
  // Neither m1 nor m2 is modified.
  dim product as new DynapdfMatrixMBS

  // Linear part (a, b, c, d)
  product.a = m1.a * m2.a + m1.c * m2.b
  product.b = m1.b * m2.a + m1.d * m2.b
  product.c = m1.a * m2.c + m1.c * m2.d
  product.d = m1.b * m2.c + m1.d * m2.d

  // Translation part (x, y): the offset of m2 is mapped through m1, then m1's offset is added.
  product.x = m1.a * m2.x + m1.c * m2.y + m1.x
  product.y = m1.b * m2.x + m1.d * m2.y + m1.y

  Return product
End Function
Sub ParsePage()
  // Extracts the text of the page that is currently open in pdf:
  // first the page's own text, then the text of the templates it uses.

  // Forget the templates visited on a previous page.
  redim templates(-1)

  // Without an initialized stack there is nothing we can parse.
  if not pdf.InitStack(stack) then Return

  // Reset the per-line tracking state so that the first record of this page
  // is not compared with the last record of the previous one.
  LastTextDir = TextDirection.tfNotInitialized
  LastTextX = 0.0
  LastTextY = 0.0
  LastTextEndX = 0.0
  LastTextEndY = 0.0
  LastTextInfX = 0.0
  LastTextInfY = 0.0

  ParseText
  ParseTemplates
End Sub
Sub ParseTemplates()
  // Extracts the text of every template reported by pdf.GetTemplCount for the
  // currently open object, descending recursively into sub-templates.
  // Handles that were already processed are remembered in "templates" and skipped,
  // so each template's text is written only once.
  dim tmplCount as integer = pdf.GetTemplCount
  for i as integer = 0 to tmplCount-1
    if pdf.EditTemplate(i) = false then Return
    dim template as integer = pdf.GetTemplHandle
    if templates.IndexOf(template) < 0 then
      templates.Append template
      if pdf.InitStack(stack) = false then
        // Close the template we just opened before giving up, so it is not left open.
        call pdf.EndTemplate
        Return
      end if
      ParseText
      // Recursive for subtemplates. One call is enough: ParseTemplates iterates
      // over all templates of the open template itself, and repeated calls would
      // only find handles that are already in "templates".
      ParseTemplates
    end if
    call pdf.EndTemplate
  next
End Sub
Sub ParseText()
  // Reads text records from pdf into Stack one after another and hands each of
  // them to AddText.

  // Fetch the first record, if there is one.
  dim more as Boolean = pdf.GetPageText(Stack)

  // Nothing further to read and the record is empty: no text at all.
  if more = false and Stack.TextLen = 0 then Return

  AddText

  // Keep fetching for as long as GetPageText reports success; every
  // successfully fetched record is added.
  while more
    more = pdf.GetPageText(Stack)
    if more then AddText
  wend
End Sub
Shared Sub Transform(m as DynapdfMatrixMBS, byref x as Double, byref y as Double)
  // Maps the point (x, y) through the matrix m. The result is written back
  // into x and y (both are passed by reference).
  dim srcX as Double = x
  dim srcY as Double = y
  // Both results must be computed from the original coordinates.
  x = srcX * m.a + srcY * m.c + m.x
  y = srcX * m.b + srcY * m.d + m.y
End Sub
Sub Write(t as string)
  // Writes t to the output stream after converting it to UTF-8.
  dim utf8Text as string = ConvertEncoding(t, encodings.UTF8)
  out.Write utf8Text
End Sub
Sub WritePageIdentifier(pagenum as integer)
  // Writes a separator line containing the page number to the output file.
  // From the second page on, a line break is written first.
  if pagenum > 1 then out.WriteLine
  dim header as string = "%----------------------- Page " + str(pagenum) + " -----------------------------"
  out.WriteLine header
End Sub
Property LastTextDir As TextDirection
Property LastTextEndX As Double
Property LastTextEndY As Double
Property LastTextInfX As Double
Property LastTextInfY As Double
Property LastTextX As Double
Property LastTextY As Double
Property Templates() As Integer
Property out As TextOutputStream
Property pdf As DynaPDFMBS
Property stack As DynaPDFStackMBS
End Class
FileTypes1
Filetype application/pdf
End FileTypes1
Class MyDynaPDFMBS Inherits DynaPDFMBS
EventHandler Function Error(ErrorCode as integer, ErrorMessage as string, ErrorType as integer) As integer
  // Called with an error or warning from the PDF object. Logs the message and asks
  // the user how to proceed.
  // Returns 0 to ignore the problem, or -1 to stop when the user chooses Cancel.
  dim details as string = str(ErrorCode)+": "+ErrorMessage

  // Output all messages on the console.
  System.DebugLog details

  // A warning or an error?
  dim isWarning as Boolean = (BitAnd(ErrorType, me.kE_WARNING) = me.kE_WARNING)

  // The user chose earlier to ignore warnings: no dialog needed.
  if isWarning and IgnoreWarnings then Return 0

  // Build the dialog.
  Dim d as New MessageDialog
  d.icon = MessageDialog.GraphicCaution // display warning icon
  d.ActionButton.Caption = "Continue"
  d.CancelButton.Visible = True // show the Cancel button
  if isWarning then
    d.Message = "A warning occurred while processing your PDF code."
    // Third button: lets the user ignore this and all further warnings.
    d.AlternateActionButton.Caption = "Ignore warnings"
    d.AlternateActionButton.Visible = true
  else
    d.Message = "An error occurred while processing your PDF code."
  end if
  d.Explanation = details

  // Show the dialog and act on the button that was pressed.
  Dim pressed as MessageDialogButton = d.ShowModal
  if pressed = d.CancelButton then Return -1 // stop
  if pressed = d.AlternateActionButton then IgnoreWarnings = true
  Return 0 // ignore
End EventHandler
Property IgnoreWarnings As Boolean
End Class
End Project
See also:
- /DynaPDF/text editing/delete text
- /DynaPDF/text editing/edit text
- /DynaPDF/text editing/replace text
- /DynaPDF/text formatting/text formatting Ansi
- /DynaPDF/text formatting/text formatting parallel
- /DynaPDF/text formatting/text formatting Unicode
- /DynaPDF/Text Positions
- /DynaPDF/Text Positions with parser
The items on this page are in the following plugins: MBS DynaPDF Plugin.