# Default configuration for HathiTrust SIP validator.
# Refer to the specification for SIPs at:  http://bit.ly/1jboMIC

# You can disable a check by commenting it out (prepending a '#' character).
# Checks to consider disabling:
#
# MetaYml::PageData::Presence - if you are not producing page tag / page number data
#
# OCR::CoordinatePresence - if you are not producing coordinate OCR (e.g. ALTO,
# hOCR, etc. in a .html or .xml file)

---
# Checks that run once against the package as a whole.
# NOTE(review): the list after each check name appears to name the checks it
# depends on (e.g. MetaYml::WellFormed lists MetaYml::Exists) - confirm against
# the validator.
package_checks:
  # NOTE(review): no description was provided for this check; the "unexpected
  # file type" description attached to Package::FileBasenames further down may
  # belong here - confirm.
  - Package::FileTypes: []
  # Warning when the package contains marc.xml, which is no longer needed and
  # will not be used.
  - Package::MarcXML: []
  # Error when the package contains more than one PDF file.
  - Package::PDFCount: []
  # Warning per YAML or md5 file other than meta.yml and checksum.md5.
  - Package::ExtraFiles: []
  # Error per filename that occurs more than once in the SIP, i.e. two files
  # sharing a filename under different paths in the ZIP file.
  - Package::DuplicateFilenames: []
  # Warning per file in the ZIP with an unexpected file type.
  # NOTE(review): the check name suggests file basenames rather than file
  # types; confirm this description is attached to the right check.
  - Package::FileBasenames: []
  # Error when meta.yml is absent from the package.
  - MetaYml::Exists: []
  # Error when meta.yml cannot be loaded or parsed. Most other package
  # validators depend on this one and are skipped when meta.yml is missing
  # or malformed.
  - MetaYml::WellFormed: ['MetaYml::Exists']
  # Warning when no reading order / scanning order is given. Error when a
  # value is not left-to-right or right-to-left, or when only one of reading
  # order and scanning order is provided.
  - MetaYml::PageOrder: ['MetaYml::WellFormed']
  # Warning when meta.yml has no page number / page tag data. Disable this
  # check if you are not producing page data.
  - MetaYml::PageData::Presence: ['MetaYml::WellFormed']
  # Error per page tag outside the allowed set (see the specification).
  - MetaYml::PageData::PageTags: ['MetaYml::WellFormed']
  # Error per page data value that is not in the correct format, e.g.
  # { label: 'pagetag', orderlabel: 'pagenumber' }.
  - MetaYml::PageData::Values: ['MetaYml::WellFormed']
  # Error per page data key that is not in the correct format, e.g.
  # 00000001.tif.
  - MetaYml::PageData::Keys: ['MetaYml::WellFormed']
  # Warning per page data key that names a file missing from the package.
  - MetaYml::PageData::Files: ['MetaYml::WellFormed']
  # Warning per unknown key in meta.yml.
  - MetaYml::UnknownKeys: ['MetaYml::WellFormed']
  # Warning per required key missing from meta.yml. Currently only
  # capture_date is unconditionally required.
  - MetaYml::RequiredKeys: ['MetaYml::WellFormed']
  # Error per date in meta.yml that is not in ISO8601 combined format, e.g.
  # 2016-12-08T01:02:03-05:00.
  - MetaYml::DateFormat: ['MetaYml::WellFormed']
  # Error when checksum.md5 is missing from the SIP.
  - Checksums::Exists: []
  # Error per line in checksum.md5 that does not appear to contain an MD5
  # checksum (32 hexadecimal digits). Checksum validation does not run if any
  # such malformed checksum is found.
  - Checksums::WellFormed: ['Checksums::Exists']
  # Error per file in the package with no checksum in checksum.md5.
  - Checksums::FileListComplete: ['Checksums::WellFormed']
  # Error per missing, duplicated, or malformed sequence number (the image
  # filename without its extension, e.g. '00000001' for '00000001.tif').
  - Image::Sequence: []
  # Warning per image lacking a corresponding .txt OCR file.
  - OCR::Presence: []
  # Error per .txt OCR file lacking a corresponding .tif or .jp2 image file.
  - OCR::HasImage: []
  # Warning per .txt OCR file lacking corresponding .html or .xml coordinate
  # OCR. Disable this check if you are not submitting coordinate OCR.
  - OCR::CoordinatePresence: []
  # Warning when the package mixes .html and .xml coordinate OCR files.
  - OCR::CoordinateFormat: []
  # Error per .html or .xml (coordinate OCR) file lacking a corresponding
  # plain-text OCR file.
  - OCR::CoordinateHasPlain: []

# Checks that run once per relevant file in the package.
file_checks:
  # Error per .txt, .html, or .xml file containing invalid UTF-8 byte
  # sequences.
  - OCR::UTF8: []
  # Error per .txt, .html, or .xml file containing control characters other
  # than tab, line feed, and carriage return, i.e. any Unicode character in
  # U+0000-U+001F except U+0009, U+000A, and U+000D.
  - OCR::ControlChars: ['OCR::UTF8']
  # Error per .html or .xml coordinate OCR file that is not well-formed XML.
  - OCR::WellFormedXML: []
  # Error per file whose checksum differs from the one listed in checksum.md5.
  - Checksums::ExpectedValue: ['Checksums::WellFormed']