---
# Use the pdfbox parser, since it's the same one we originally used to extract the text when building this planning document.
extractor: 'pdf.pdfbox'

# All measurements are in points. 1 point = 1/72 of an inch.
# x-coordinates are from the left edge of the page.
# y-coordinates are from the top edge of the page.
header:
  # Ignore anything less than this many points from the top of the page.
  # This is the default value; the setting can be given as a default and per-page.
  default: 130
footer:
  # Ignore anything in the footer region at the bottom of the page.
  # This is the default value; the setting can be given as a default and per-page.
  # NOTE(review): the original note says "less than this many points from the bottom", but 700
  # looks like a y-coordinate measured from the top edge (like the header) - confirm.
  default: 700

# Text segments are generally parsed in order, top to bottom, left to right.
# If two text segments have y-coordinates within this many points, consider them on the same line,
# and process the one further left first, even if it is slightly lower on the page (e.g. 0.4pt lower).
maxRowDistance: 2

# Define the output data record.
# The main record type we're collecting information on is our employees,
# so 'employee' is the root type for the harvested information.
# The name must match one of the keys under 'recordTypes' below.
rootRecordType: employee
recordTypes:
  employee:
    # Labels are used when nested recordTypes come into play, as they do in this document.
    label: 'employee'
    valueTypes:
      # Not sure what to name a valueType? Just make something up!
      - employee
      - name
      - hiredate
      - occupation
      - showinfo
      - bool1
      - bool2
      - bool3
      - salary
    children:
      # In this example an employee can have multiple children nested beneath it,
      # so 'child' is listed as a child recordType of 'employee'.
      - child
  child:
    label: 'child'
    valueTypes:
      - child
      - grade

valueTypes:
  # Each label is the CSV column header used for that valueType,
  # e.g. "Employee ID" is used instead of "employee".
  employee:
    label: 'Employee ID'
  name:
    label: 'Name'
  hiredate:
    label: 'Hire Date'
  occupation:
    label: 'Occupation'
  salary:
    label: 'Salary'
  showinfo:
    label: 'Important Info?'
  bool1:
    label: 'Boolean 1'
  bool2:
    label: 'Boolean 2'
  bool3:
    label: 'Boolean 3'
  child:
    label: 'Attending Child'
  grade:
    label: 'Grade'

# Now we define the finite-state machine.
# First, name the state the machine starts in; it must be one of the keys under 'states'.
initialState: 'INIT'

# When each text segment is encountered, each transition for the current state is checked.
states:
  INIT:
    transitions:
      # The first bit of text we reach is 'ID-0001', so we list the only transition that should work here.
      # If its condition matches (which it should), we switch to the 'employee' state.
      # Curious about a condition? Scroll further down to the conditions section of this YAML.
      - condition: employee
        nextState: employee

  # ID number with the format 'ID-####'.
  employee:
    # When we enter this state, we start a new record.
    startRecord: true
    transitions:
      # Now we move on to the name label, once again by verifying the condition and moving on after that.
      - condition: namelabel
        nextState: namelabel

  namelabel:
    # The label isn't important information in and of itself, so we simply leave it out of the data.
    include: false
    transitions:
      - condition: name
        nextState: name

  name:
    transitions:
      # Sometimes a name will be in two segments, and we'll hit another 'name' text segment before anything else.
      # In that case, a state can transition to itself, compounding the information picked up in it.
      - condition: name
        nextState: name
      # Does the first condition not match the text? Then we move on to the next one.
      - condition: hiredateLabel
        nextState: hiredateLabel

  hiredateLabel:
    include: false
    transitions:
      - condition: hiredateLabel
        nextState: hiredateLabel
      - condition: hiredate
        nextState: hiredate

  hiredate:
    transitions:
      - condition: occupationLabel
        nextState: occupationLabel

  occupationLabel:
    include: false
    transitions:
      - condition: occupation
        nextState: occupation

  occupation:
    transitions:
      - condition: occupation
        nextState: occupation
      # The next two states show how conditions let you choose what to include and what to leave out.
      # They share the same area of the document, but have distinguishable qualities.
      # Does the text meet the 'showinfo' condition? Then we go to the 'showinfo' state, which includes it.
      - condition: showinfo
        nextState: showinfo
      # Doesn't meet 'showinfo'? Then check for 'notinfo' and exclude it.
      - condition: notinfo
        nextState: notinfo

  showinfo:
    transitions:
      - condition: showinfo
        nextState: showinfo
      - condition: bool1
        nextState: bool1

  notinfo:
    include: false
    transitions:
      - condition: notinfo
        nextState: notinfo
      - condition: bool1
        nextState: bool1

  bool1:
    transitions:
      - condition: bool2
        nextState: bool2

  bool2:
    transitions:
      - condition: bool3
        nextState: bool3

  bool3:
    transitions:
      - condition: salaryLabel
        nextState: salaryLabel

  salaryLabel:
    include: false
    transitions:
      - condition: salary
        nextState: salary

  salary:
    transitions:
      - condition: childrenLabel
        nextState: childrenLabel
      - condition: employee
        nextState: employee
      - condition: end
        nextState: end

  childrenLabel:
    include: false
    transitions:
      - condition: childrenLabel
        nextState: childrenLabel
      - condition: childLabel
        nextState: childLabel

  childLabel:
    include: false
    transitions:
      - condition: child
        nextState: child

  child:
    # Here we reach a datatype nested within another datatype. We can start records using this child datatype.
    # In the process, we'll be making multiple rows for the parent datatype, each one holding on to its own child.
    startRecord: true
    transitions:
      - condition: child
        nextState: child
      - condition: gradeLabel
        nextState: gradeLabel
      - condition: childLabel
        nextState: childLabel

  gradeLabel:
    include: false
    transitions:
      # Normally, a grade would always appear right after the label.
      # But in this document, there is one instance of that not happening, under ID-0007's child.
      - condition: grade
        nextState: grade
      # So we account for that possibility by adding a transition out of the label.
      # NOTE(review): unlike 'grade', this state has no 'childLabel' or 'end' transition; if a child
      # without a grade can be followed by another child or by the end of the data, add them - confirm.
      - condition: employee
        nextState: employee

  grade:
    transitions:
      - condition: employee
        nextState: employee
      - condition: childLabel
        nextState: childLabel
      # Reached the end of the usable info in a document, but there's still text left to go?
      # An easy fix is to create a looping, not-included state to finish the document off.
      - condition: end
        nextState: end

  end:
    # We reached a point in the document where all the useful information is gone, but we still have text to go.
    include: false
    transitions:
      # An always-true condition such as 'any' loops this state until the whole document has been gone through.
      - condition: any
        nextState: end

# Here we define the conditions:
conditions:
  # An example of comparing text with a regex.
  # Here we check that the text contains the characters 'ID-' followed by any number of digits.
  # NOTE(review): inside a single-quoted YAML scalar backslashes are literal, so the parsed pattern
  # contains '\\d'; confirm the condition parser reads that as the digit class and not a literal backslash.
  employee: 'text =~ /ID-(\\d)*/'

  # You can match on the x- and y-coordinates of the upper-left and lower-right corners of the rectangle
  # containing the text: ulx = Upper-Left X-coordinate, lry = Lower-Right Y-coordinate, plus uly and lrx.
  # You can give a lower and an upper limit for each.
  # NOTE(review): the original notes call these limits inclusive, yet the operator used is '<' - confirm.
  namelabel: '70 < ulx < 80 and font = "BCDFEE+Calibri-Bold"'

  # You can also match on the font used, including whether it is bold or italic.
  name: '112 < ulx < 200 and font = "BCDEEE+Calibri"'

  hiredateLabel: '230 < ulx < 270 and font = "BCDFEE+Calibri-Bold"'
  hiredate: '280 < ulx < 290 and font = "BCDEEE+Calibri"'

  occupationLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"'
  occupation: '394 < ulx < 700 and font = "BCDEEE+Calibri"'

  # These two are told apart purely by font.
  showinfo: 'font = "BCDJEE+Georgia"'
  notinfo: 'font = "BCDEEE+Calibri"'

  # The three boolean conditions are identical; the state machine's transition order tells them apart.
  bool1: 'font = "BCDIEE+Cambria"'
  bool2: 'font = "BCDIEE+Cambria"'
  bool3: 'font = "BCDIEE+Cambria"'

  salaryLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"'
  salary: '394 < ulx < 700 and font = "BCDEEE+Calibri"'

  childrenLabel: '70 < ulx < 140 and font = "BCDFEE+Calibri-Bold" and text =~ /(Attending)|(Children:)/'

  childLabel: '230 < ulx < 240 and font = "BCDFEE+Calibri-Bold"'
  child: '230 < ulx < 380 and font = "BCDEEE+Calibri"'

  gradeLabel: '391 < ulx < 393 and font = "BCDFEE+Calibri-Bold"'
  grade: '394 < ulx < 700 and font = "BCDEEE+Calibri"'

  # You can also match on the size of the font and on specific text.
  end: 'fontSize = 16.0 and text = "TOTAL:"'

  # Need a condition that is always true? '1 = 1' does that for you.
  any: '1 = 1'