const CsvParser = require('./lib/CsvParser')
const CSVSniffer = require('csv-sniffer')
const { Readable } = require('readable-stream')

/**
 * Parser of the input stream (JS string chunks that have UTF16 encoding in ES6).
 *
 * Wraps a `CsvParser` instance and, when the delimiter or the quotes are not
 * specified, buffers the head of the input, sniffs the missing CSV options
 * from it with `csv-sniffer`, and only then feeds the data to the `CsvParser`.
 */
class Parser {
  /**
   * @param {object} [options]
   * @param {*} [options.relaxColumnCount] - forwarded as-is to `CsvParser`
   * @param {*} [options.skipLinesWithError] - forwarded as-is to `CsvParser`
   * @param {string} [options.delimiter] - inferred from the input if not specified
   * @param {string} [options.quotes] - inferred from the input if not specified
   * @param {string} [options.newLine] - optionally used only when quotes or
   *   delimiter are inferred; inferred as well if not specified in that case
   */
  constructor ({
    relaxColumnCount,
    skipLinesWithError,
    delimiter,
    quotes,
    newLine
  } = {}) {
    this.relaxColumnCount = relaxColumnCount
    this.skipLinesWithError = skipLinesWithError
    this.delimiter = delimiter
    this.quotes = quotes
    this.newLine = newLine
  }

  /**
   * Connect a readable input stream to a new `CsvParser`.
   *
   * Inferred delimiter / quotes / newline values are stored on this instance,
   * so they are reused by subsequent `import()` calls on the same parser.
   *
   * @param {object} input - readable stream yielding string or Buffer chunks
   * @param {object} [options] - per-call overrides of the constructor options
   * @param {*} [options.relaxColumnCount]
   * @param {*} [options.skipLinesWithError]
   * @returns {object} the `CsvParser` instance that receives the input data
   */
  import (input, {
    relaxColumnCount = this.relaxColumnCount,
    skipLinesWithError = this.skipLinesWithError
  } = {}) {
    const output = new CsvParser({
      delimiter: this.delimiter,
      // ATTENTION: the spread operator ([...symb]) is required in case of a unicode quote
      // that is represented with more than a single code point
      // TODO: consider distinct opening and closing unicode quotes
      quoteChar: this.quotes,
      newlineStr: this.newLine,
      relaxColumnCount,
      skipLinesWithError
    })

    // Nothing to read: the input has been fully consumed already
    if (!input.readable && input.readableEnded) {
      output.emit('end')
      return output
    }

    // Process the remained input data
    input.on('error', (err) => {
      output.emit('error', err)
      console.warn('input.error()')
    })

    // All options required for parsing are known: no sniffing is needed
    if (this.delimiter && this.quotes) {
      input.pipe(output)
      return output
    }

    // Identify delimiter and quotes, given the CSV stream
    let inpPart = '' // Partial input string

    //* Processing of the partial input string
    const inpPartProc = () => {
      // Sniff the fetched data
      // TODO: consider unicode delimiter and quotes
      const sniffer = new (CSVSniffer())()
      const csvMeta = sniffer.sniff(inpPart, {
        delimiter: (this.delimiter || '')[0],
        quoteChar: (this.quotes || '')[0],
        newlineStr: this.newLine
      })

      const updOpts = {}
      if (!this.delimiter && csvMeta.delimiter) {
        console.info(`inpPartProc()> Inferred delimiter: ${csvMeta.delimiter}`)
        updOpts.delimiter = this.delimiter = csvMeta.delimiter
      }
      if (!this.quotes && csvMeta.quoteChar) {
        console.info(`inpPartProc()> Inferred quoteChar: ${csvMeta.quoteChar}`)
        updOpts.quoteChar = this.quotes = csvMeta.quoteChar
      }
      if (!this.newLine && csvMeta.newlineStr) {
        let cpoints = ''
        for (const cp of csvMeta.newlineStr) {
          cpoints += ' 0x' + cp.codePointAt(0).toString(16).toUpperCase()
        }
        console.info(`inpPartProc()> Inferred newLine (codepoints): ${cpoints}`)
        updOpts.newlineStr = this.newLine = csvMeta.newlineStr
      }
      if (Object.keys(updOpts).length !== 0) {
        output.updateOptions(updOpts)
      }

      // Replay the sniffed data into the output, then attach the rest of the input
      // ATTENTION: Readable.from is not available in the browser
      const rdinp = new Readable()
      rdinp.push(inpPart)
      rdinp.push(null)
      rdinp.on('error', (err) => {
        output.emit('error', err)
      }).on('end', () => {
        console.debug(`rdinp.end()> input is {readable: ${input.readable}, ended: ${input.readableEnded}}`)
        if (!input.readable && input.readableEnded) {
          output.emit('end')
          return
        }
        input.on('end', () => {
          console.debug('input.end()')
        })
        input.pipe(output)
      }).pipe(output, { end: false }) // Keep the output open for the remaining input
    }

    const linesMin = 10 // Min number of lines to read for the automatic inference
    const lengthMax = 512 * 1024 // Soft max length of the buffered text to be read for sniffing
    const clf = '\n'.codePointAt(0)
    const ccr = '\r'.codePointAt(0)
    let nlf = 0 // Linefeed counter
    let ncr = 0 // Carriage return counter

    //* Count the newline symbols of a chunk to identify the number of fetched lines
    const countNewlines = (chunk) => {
      // Iterating a string yields 1-symbol strings, whereas iterating a Buffer yields
      // byte numbers, so normalize to a numeric code before the strict comparison
      const isText = typeof chunk === 'string'
      for (const c of chunk) {
        const code = isText ? c.codePointAt(0) : c
        if (code === clf) {
          ++nlf
        } else if (code === ccr) {
          ++ncr
        }
      }
    }

    //* Stop buffering and hand the accumulated data over to the sniffing
    const startProcessing = () => {
      input.removeListener('readable', onReadable)
      input.removeListener('end', onEnd)
      inpPartProc()
    }

    const onReadable = () => {
      let chunk
      // Use a loop to make sure we read all currently available data
      while ((chunk = input.read()) !== null) {
        countNewlines(chunk)
        inpPart += chunk
        if (Math.max(nlf, ncr) >= linesMin || inpPart.length >= lengthMax) {
          startProcessing()
          return
        }
      }
      // No more data is available right now: sniff what has been fetched so far
      if (inpPart) {
        startProcessing()
      }
    }

    // The input ended before the sniffing has started (e.g., an empty input):
    // finalize the output, otherwise its consumer would wait forever
    const onEnd = () => {
      input.removeListener('readable', onReadable)
      if (inpPart) {
        inpPartProc()
      } else {
        output.emit('end')
      }
    }

    input.on('readable', onReadable)
    input.on('end', onEnd)

    return output
  }

  /**
   * Shortcut for `new Parser(options).import(input)`.
   *
   * @param {object} input - readable stream with the CSV data
   * @param {object} [options] - `Parser` constructor options
   * @returns {object} the `CsvParser` instance that receives the input data
   */
  static import (input, options) {
    return (new Parser(options)).import(input)
  }
}

module.exports = Parser