---
# Harvest plan for the employee sample document: it declares the record layout,
# the finite-state machine that walks the extracted text segments, and the
# conditions that drive each state transition.

# Use the pdfbox parser, since it's the same one we used to originally extract
# the text to build this planning document.
extractor: "pdf.pdfbox"

# All measurements are in points. 1 point = 1/72 of an inch.
# x-coordinates are from the left edge of the page.
# y-coordinates are from the top edge of the page.
header:
  # ignore anything less than this many points from the top, default and per-page
  default: 130
footer:
  # ignore anything less than this many points from the bottom, default and per-page
  # NOTE(review): y-coordinates are measured from the top edge, so 700 presumably
  # marks the y-position where the footer starts rather than a distance from the
  # bottom edge - confirm against the harvester documentation.
  default: 700

# Text segments are generally parsed in order, top to bottom, left to right.
# If two text segments have y-coordinates within this many points, consider them
# on the same line, and process the one further left first, even if it is 0.4pt
# lower on the page.
maxRowDistance: 2

# Define the output data record.
# Since the main record type we're collecting information on is our employees,
# we'll have that be the root type for our harvested information.
rootRecordType: employee
recordTypes:
  employee:
    # Labels are used when nested recordTypes come into play, like this document.
    label: "employee"
    valueTypes:
      # Not sure what to name a valueType? Just make something up!
      - employee
      - name
      - hiredate
      - occupation
      - showinfo
      - bool1
      - bool2
      - bool3
      - salary
    children:
      # In this example, there are multiple children nested under an employee,
      # so we'll treat it as a 'child' to the 'employee' recordType.
      - child
  child:
    label: "child"
    valueTypes:
      - child
      - grade

valueTypes:
  employee:
    # In the CSV, use "Employee ID" as the column header instead of "employee".
    label: "Employee ID"
  name:
    label: "Name"
  hiredate:
    label: "Hire Date"
  occupation:
    label: "Occupation"
  salary:
    label: "Salary"
  showinfo:
    label: "Important Info?"
  bool1:
    label: "Boolean 1"
  bool2:
    label: "Boolean 2"
  bool3:
    label: "Boolean 3"
  child:
    label: "Attending Child"
  grade:
    label: "Grade"

# Now we define the finite-state machine.
# Let's name the state that our machine starts off with:
initialState: "INIT"

# When each text segment is encountered, each transition for the current state
# is checked.
states:
  INIT:
    transitions:
      # The first bit of text we reach is 'ID-0001', so we'll try the only
      # transition that should work here.
      # If this condition matches (which it should), we'll switch to the
      # 'employee' state!
      # Curious about the condition? Scroll further down to the conditions
      # section of this YAML.
      - condition: employee
        nextState: employee

  # ID number with the format 'ID-####'
  employee:
    # When we enter this state, we'll create a new employee record.
    startRecord: true
    transitions:
      # Now we move on to the name label. Once again, by verifying the
      # condition and moving on after that.
      - condition: namelabel
        nextState: namelabel

  namelabel:
    # The label isn't important information in and of itself, so we can just
    # not include it in the data.
    include: false
    transitions:
      - condition: name
        nextState: name

  name:
    transitions:
      # Sometimes a name will be in two segments, and we'll hit another 'name'
      # text segment before anything else. In that case, a state can transition
      # to itself, compounding the information picked up in it.
      - condition: name
        nextState: name
      # Does the first condition not match the text? We move onto the next one.
      - condition: hiredateLabel
        nextState: hiredateLabel

  hiredateLabel:
    include: false
    transitions:
      - condition: hiredateLabel
        nextState: hiredateLabel
      - condition: hiredate
        nextState: hiredate

  hiredate:
    transitions:
      - condition: occupationLabel
        nextState: occupationLabel

  occupationLabel:
    include: false
    transitions:
      - condition: occupation
        nextState: occupation

  occupation:
    transitions:
      - condition: occupation
        nextState: occupation
      # This state and the next are an example of how you can choose, using
      # conditions, what to include or not. They share the same area of a
      # document, but have qualities to them that can be distinguishable.
      # Does it meet 'showinfo' conditions? Then we go to the 'showinfo' state
      # that includes it.
      - condition: showinfo
        nextState: showinfo
      # Doesn't meet 'showinfo'? Then check for 'notinfo' and exclude it.
      - condition: notinfo
        nextState: notinfo

  showinfo:
    transitions:
      - condition: showinfo
        nextState: showinfo
      - condition: bool1
        nextState: bool1

  notinfo:
    include: false
    transitions:
      - condition: notinfo
        nextState: notinfo
      - condition: bool1
        nextState: bool1

  bool1:
    transitions:
      - condition: bool2
        nextState: bool2

  bool2:
    transitions:
      - condition: bool3
        nextState: bool3

  bool3:
    transitions:
      - condition: salaryLabel
        nextState: salaryLabel

  salaryLabel:
    include: false
    transitions:
      - condition: salary
        nextState: salary

  salary:
    transitions:
      - condition: childrenLabel
        nextState: childrenLabel
      - condition: employee
        nextState: employee
      - condition: end
        nextState: end

  childrenLabel:
    include: false
    transitions:
      - condition: childrenLabel
        nextState: childrenLabel
      - condition: childLabel
        nextState: childLabel

  childLabel:
    include: false
    transitions:
      - condition: child
        nextState: child

  child:
    # Here we reach a datatype nested within another datatype. We can start
    # records using this child datatype. In the process, we'll be making
    # multiple rows for the parent datatype, each one holding onto its own child.
    startRecord: true
    transitions:
      - condition: child
        nextState: child
      - condition: gradeLabel
        nextState: gradeLabel
      - condition: childLabel
        nextState: childLabel

  gradeLabel:
    include: false
    transitions:
      # Normally, there would always be an instance of a grade appearing right
      # after the label. But in this document, we have one instance of that not
      # happening under ID-0007's child.
      - condition: grade
        nextState: grade
      # So we just account for that possibility by adding a transition out of
      # the label.
      - condition: employee
        nextState: employee

  grade:
    transitions:
      - condition: employee
        nextState: employee
      - condition: childLabel
        nextState: childLabel
      # Reach the end of the usable info in a document, but there's still text
      # left to go? An easy fix is to just create a looping, not-included state
      # to finish the document off.
      - condition: end
        nextState: end

  end:
    # We reached a point in the document where all the useful information is
    # gone, but we still have text to go.
    include: false
    transitions:
      # By using an always-true condition such as 'any', we can loop this state
      # until the document has been completely gone through.
      - condition: any
        nextState: end

# Here we define the conditions:
conditions:
  # An example of comparing text with regex.
  # In this case, we're making sure that the text contains the characters 'ID-'
  # followed by any amount of numbers.
  # NOTE(review): inside single quotes YAML keeps both backslashes literally, so
  # the harvester receives the pattern with a doubled backslash - confirm that
  # its condition parser unescapes this to a digit class.
  employee: 'text =~ /ID-(\\d)*/'
  # You can match based on the x- and y- coordinates of the upper left and lower
  # right corners of the rectangle containing the text.
  # ulx = Upper-Left X-coordinate. lry = Lower-Right Y-coordinate. Also uly and lrx.
  # You can define the lower and upper limit for each, inclusive.
  namelabel: '70 < ulx < 80 and font = "BCDFEE+Calibri-Bold"'
  # You can also match based on the type of font used, including if it was
  # bolded or italicized.
  name: '112 < ulx < 200 and font = "BCDEEE+Calibri"'
  hiredateLabel: '230 < ulx < 270 and font = "BCDFEE+Calibri-Bold"'
  hiredate: '280 < ulx < 290 and font = "BCDEEE+Calibri"'
  occupationLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"'
  occupation: '394 < ulx < 700 and font = "BCDEEE+Calibri"'
  showinfo: 'font = "BCDJEE+Georgia"'
  notinfo: 'font = "BCDEEE+Calibri"'
  bool1: 'font = "BCDIEE+Cambria"'
  bool2: 'font = "BCDIEE+Cambria"'
  bool3: 'font = "BCDIEE+Cambria"'
  salaryLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"'
  salary: '394 < ulx < 700 and font = "BCDEEE+Calibri"'
  childrenLabel: '70 < ulx < 140 and font = "BCDFEE+Calibri-Bold" and text =~ /(Attending)|(Children:)/'
  childLabel: '230 < ulx < 240 and font = "BCDFEE+Calibri-Bold"'
  child: '230 < ulx < 380 and font = "BCDEEE+Calibri"'
  gradeLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"'
  grade: '394 < ulx < 700 and font = "BCDEEE+Calibri"'
  # You can also match based on the size of the font and on specific text.
  end: 'fontSize = 16.0 and text = "TOTAL:"'
  # Need a condition that is always true? "1=1" does that for you.
  any: "1 = 1"