#line 1 "api.c" #include #line 1 "api.h" #ifndef VOLUBILE_H #define VOLUBILE_H #define VB_VERSION "0.1" #include #include #include /* Defined in "mini.h". See https://github.com/michaelnmmeyer/mini */ struct mini; /* Error codes. */ enum { VB_OK, /* No error. */ VB_E2LONG, /* Query string too long. */ VB_EPAGE, /* Page size too large. */ VB_EQUTF8, /* Query string is not valid UTF-8. */ VB_ELUTF8, /* Lexicon contains an invalid UTF-8 string. */ VB_EFSA, /* Lexicon is not a numbered automaton. */ }; /* Returns a string describing an error code. */ const char *vb_strerror(int err); enum vb_match_mode { VB_AUTO, /* Select the matching mode automatically. */ VB_EXACT, /* Match a literal string, no magic character. */ VB_PREFIX, /* Prefix matching. */ VB_SUBSTR, /* Substring matching. */ VB_SUFFIX, /* Suffix matching. */ VB_GLOB, /* Glob matching. */ VB_LEVENSHTEIN, /* Levenshtein distance. */ VB_DAMERAU, /* Damerau-Levenshtein distance. */ VB_LCSUBSTR, /* Longest common substring. */ VB_LCSUBSEQ, /* Longest common subsequence. */ VB_MODES_NR }; #define VB_MAX_PAGE_SIZE 30 /* Maximum allowed number of words per page. */ struct vb_query { const char *query; /* Query string and its length. */ size_t len; enum vb_match_mode mode; /* Matching mode to use. */ size_t page_size; /* Maximum number of words to return page. */ /* Maximum allowed edit distance. Only taken into account when the matching * mode is VB_LEVENSHTEIN or VB_DAMERAU. */ int32_t max_dist; /* Length, in code points, of the prefix that must be shared between the * query word and some given word in the lexicon for the second to be * considered a potential match. This concerns fuzzy matching search * modes, at the exception of VB_LCSUBSTR. * Increasing this value accelerates fuzzy matching, but decreases recall. * 1 or 2 is fine; higher values are likely to be harmful. */ size_t prefix_len; /* State data for paginating matching words. Must be filled with zeroes the * first time vb_match() is called. After a call, these values are updated * in such a manner that, if vb_match() is called again with this same * structure, it will fetch the next results page, if any remain. */ struct vb_pagination { bool last_page; /* Whether we just returned the last page. */ uint32_t last_pos; /* Position of the last seen word. */ int32_t last_weight; /* Lowest rank among the returned words. */ } pagination; }; /* Sample initializer for the above structure. */ #define VB_QUERY_INIT { \ .query = "", \ .mode = VB_AUTO, \ .page_size = 10, \ .max_dist = 3, \ .prefix_len = 1, \ } /* Searches a lexicon. * The lexicon must be a numbered automaton. * The provided callback function will be called for each matching word. It will * be passed the current matching word. On success, returns VB_OK, otherwise an * error code. */ int vb_match(const struct mini *lexicon, struct vb_query *, void (*handler)(void *arg, const char *token, size_t len), void *arg); #endif #line 3 "api.c" #line 1 "priv.h" #ifndef VB_PRIV_H #define VB_PRIV_H #include #line 1 "mini.h" #ifndef MINI_H #define MINI_H #define MN_VERSION "0.1" #include #include #include /* Maximum length of a word, in bytes, discounting the terminal nul byte. */ #define MN_MAX_WORD_LEN 333 /* Error codes. * All functions below that return an int (except mn_contains()) return one of * these. */ enum { MN_OK, /* No error. */ MN_EWORD, /* Attempt to add the empty string or a too long word. */ MN_EORDER, /* Word added out of order. */ MN_EMAGIC, /* Magic identifier mismatch. */ MN_EVERSION, /* Version mismatch. */ MN_ECORRUPT, /* Automaton is corrupt. */ MN_EFREEZED, /* Attempt to add a word to a freezed automaton. */ MN_E2BIG, /* Automaton has grown too large. */ MN_EIO, /* IO error. */ }; /* Returns a string describing an error code. */ const char *mn_strerror(int err); /******************************************************************************* * Encoder ******************************************************************************/ enum mn_type { MN_STANDARD = 0, /* Classic automaton. */ MN_NUMBERED = 1, /* Numbered automaton. */ }; struct mini_enc; /* Allocates a new automaton encoder. * If MN_NUMBERED is chosen as the automaton type, a numbered automaton is * created instead of a classical one. This makes possible retrieving a word * given its ordinal, and retrieving a word ordinal given the word itself. On * the other hand, this doubles the automaton size. */ struct mini_enc *mn_enc_new(enum mn_type type); /* Destructor. */ void mn_enc_free(struct mini_enc *); /* Encodes a new word. * Words must be added in lexicographical order, must be sorted byte-wise, * must be unique, and their length must be greater than zero and not exceed * MN_MAX_WORD_LEN. On error, no new words should be added, and the encoder * should be cleared before anything else is done with it. */ int mn_enc_add(struct mini_enc *, const void *word, size_t len); /* Dumps an encoded automaton. * The provided callback will be called several times for writing the automaton * to some file or memory location. It must return zero on success, non-zero on * error. If it returns non-zero, this function will return MN_EIO. * * The automaton is freezed after this function is called, so no new words * should be added afterwards, unless mn_enc_clear() is called first. * This function can be called multiple times, unless it returns MN_E2BIG, which * indicates that the encoder should be reset with mn_enc_clear() before * anything else is done with it. */ int mn_enc_dump(struct mini_enc *, int (*write)(void *arg, const void *data, size_t size), void *arg); /* Dumps an encoded automaton to a file. * The provided file must be opened in binary mode, for writing. The underlying * file descriptor is not synced with the device, but the file handle is * flushed. */ int mn_enc_dump_file(struct mini_enc *, FILE *); /* Clears the internal structures. After this is called, the encoder object can * be used again to encode a new set of words. */ void mn_enc_clear(struct mini_enc *); /******************************************************************************* * Reader ******************************************************************************/ struct mini; /* Loads an automaton. * The provided callback will be called several times for reading the automaton. * It should return zero on success, non-zero on failure. A short read must be * considered as an error. * On success, makes the provided struct pointer point to the allocated * automaton. On failure, makes it point to NULL. */ int mn_load(struct mini **, int (*read)(void *arg, void *buf, size_t size), void *arg); /* Loads an automaton from a file. * The provided file must be opened in binary mode, for reading. */ int mn_load_file(struct mini **, FILE *); /* Destructor. */ void mn_free(struct mini *); /* Returns the type of an automaton. */ enum mn_type mn_type(const struct mini *); /* Returns the number of words in an automaton. * If the automaton is numbered, this is a constant time operation. Otherwise, * this requires traversing the entire automaton. */ uint32_t mn_size(const struct mini *); /* Checks if an automaton contains a word. * Returns 1 if so, 0 otherwise. */ int mn_contains(const struct mini *, const void *word, size_t len); /* Returns the ordinal of a word. * If the automaton is numbered and the word is present in the automaton, * return a strictly positive integer indicating its position (starting at 1), * otherwise 0. */ uint32_t mn_locate(const struct mini *, const void *word, size_t len); /* Extracts a word given its ordinal. * The size of the provided buffer must be at least MN_MAX_WORD_LEN + 1. * Returns the length of the corresponding word. If the automaton isn't numbered * or the provided index is invalid, adds a nul character to the provided * buffer, and returns 0. */ size_t mn_extract(const struct mini *, uint32_t pos, void *word); /******************************************************************************* * Iterator ******************************************************************************/ /* Automaton iterator. */ struct mini_iter { const struct mini *fsa; /* Attached automaton. */ size_t root; /* Root depth. */ size_t depth; /* Current stack depth. */ uint32_t positions[MN_MAX_WORD_LEN + 1]; /* Offsets stack. */ char word[MN_MAX_WORD_LEN + 1]; /* Current word. */ }; /* Returns an iterator over an automaton, starting at the very first word. * The iterator is initialized in such a manner that it will iterate * over the whole automaton. * Returns 1 if there are words to iterate on, 0 otherwise. */ uint32_t mn_iter_init(struct mini_iter *, const struct mini *); /* Returns an iterator over all words of an automaton that start with a given * prefix, the prefix itself included if it exists as a full word in the * automaton. * If there are matching words, returns the position of the first matching * word if the automaton is numbered, 1 if it isn't. If there are no matching * words, returns 0. */ uint32_t mn_iter_initp(struct mini_iter *, const struct mini *, const void *prefix, size_t len); /* Returns an iterator over an automaton, starting at a word, and including * that word if it is part of the lexicon. * This differs from mn_iter_initp() in that the lexicon isn't required to * contain at least one word starting with the provided string, and in that the * iterator is initialized in such a manner that it will yield all remaining * words in the lexicon. * If there are matching words, returns the position of the first matching * word if the automaton is numbered, 1 if it isn't. If there are no matching * words, returns 0. */ uint32_t mn_iter_inits(struct mini_iter *, const struct mini *, const void *word, size_t len); /* Returns an iterator over an automaton, starting at a given ordinal, and * including it. Iteration will stop after the last word of the automaton has * been fetched. * Returns the position given at argument if it is valid and the automaton is * numbered, otherwise 0. */ uint32_t mn_iter_initn(struct mini_iter *, const struct mini *, uint32_t pos); /* Fetches the next word from an iterator. * If "len" is not NULL, it will be assigned the length of the returned word. * If there are no remaining words, "len" is set to 0 if it is not NULL, and * NULL is returned. */ const char *mn_iter_next(struct mini_iter *, size_t *len); /******************************************************************************* * Debugging. ******************************************************************************/ /* Output formats. */ enum mn_dump_format { MN_DUMP_TXT, /* One word per line. */ MN_DUMP_TSV, /* TSV file, one line per transition, the first one giving * the field names. */ MN_DUMP_DOT, /* DOT file, for visualization with Graphviz. */ }; /* Writes a description of an automaton to a file. */ int mn_dump(const struct mini *, FILE *, enum mn_dump_format); #endif #line 7 "priv.h" struct vb_match_ctx { struct vb_query *query; /* The query string after simplification (magic characters removed, as well * as leading and trailing wildcards, if applicable). */ enum vb_match_mode mode; const char *str; size_t len; void (*handler)(void *arg, const char *token, size_t len); void *arg; }; extern int (*const vb_match_funcs[VB_MODES_NR])(const struct mini *, struct vb_match_ctx *); void vb_parse_query(struct vb_match_ctx *, char [static MN_MAX_WORD_LEN + 1]); /* Decodes a UTF-8 string. * "dest" should be large enough to hold (len + 1) code points. * "len" must be < INT32_MAX. * The output string will be nul-terminated. * Returns the length of the decoded string on success, -1 otherwise. * We only do a surface check, not a thorough one that involves examining each * byte plus the value of the decoded code points. The goal is to detect * unexpected binary data and encoding errors that could easily lead to memory * corruptions. */ int32_t vb_utf8_decode(char32_t *restrict dest, const char *restrict str, size_t len); /* Length, in bytes, of the first "nr" code points of "str" if it was encoded * as UTF-8. */ size_t vb_utf8_bytes(const char32_t *str, size_t nr); #endif #line 4 "api.c" const char *vb_strerror(int err) { static const char *const tbl[] = { [VB_OK] = "no error", [VB_E2LONG] = "query string too long", [VB_EPAGE] = "page size too large", [VB_EQUTF8] = "query string is not valid UTF-8", [VB_ELUTF8] = "lexicon contains an invalid UTF-8 string", [VB_EFSA] = "lexicon is not a numbered automaton", }; if (err >= 0 && (size_t)err < sizeof tbl / sizeof *tbl) return tbl[err]; return "unknown error"; } int vb_match(const struct mini *lex, struct vb_query *q, void (*callback)(void *arg, const char *token, size_t len), void *arg) { if (mn_type(lex) != MN_NUMBERED) return VB_EFSA; if (q->page_size > VB_MAX_PAGE_SIZE) return VB_EPAGE; /* Technically, there can be a match if the query is longer than the longest * possible word that can be held in an automaton. But we must define a limit * because a) we use stack-allocated buffers for decoding strings, and b) * we cannot do fuzzy matching on very long strings (hard limit defined in * faconde.h). This is unlikely to be a problem anyway. */ if (q->len > MN_MAX_WORD_LEN) return VB_E2LONG; if (q->page_size == 0 || q->pagination.last_pos == UINT32_MAX) q->pagination.last_page = true; if (q->pagination.last_page) return VB_OK; struct vb_match_ctx c = { .query = q, .mode = q->mode, .str = q->query, .len = q->len, .handler = callback, .arg = arg, }; if (c.mode >= sizeof vb_match_funcs / sizeof *vb_match_funcs) c.mode = VB_AUTO; char buf[MN_MAX_WORD_LEN + 1]; vb_parse_query(&c, buf); int ret = vb_match_funcs[c.mode](lex, &c); if (q->pagination.last_page) q->pagination.last_pos = UINT32_MAX; return ret; } #line 1 "match.c" #include #include #include #line 1 "faconde.h" #ifndef FACONDE_H #define FACONDE_H #define FC_VERSION "0.1" #include #include #include /* Maximum allowed length of a sequence. We don't check internally that this * limit is respected. It could be larger, but this is unlikely to be useful. * With this value, the worst-case allocation is 64M. */ #define FC_MAX_SEQ_LEN 4096 /* Check if a string matches a glob pattern. * Matching is case-sensitive, and is performed over the whole string. The * supported syntax is as follows: * * ? matches a single character * * matches zero or more characters * [abc] matches any of the characters a, b, or c * [^abc] matches any character but a, b, and c * * Character classes are not supported. * * The characters `[`, `?`, and `*` are interpreted as literals when in a group. * The character `]`, to be included in a group, must be placed in first * position. The character `^`, if included in a group and intended to be * interpreted as a literal, must not be placed at the beginning of the group. * The character `]`, if not preceded by `[`, is interpreted as a literal. * * If the pattern is invalid, returns false. */ bool fc_glob(const char32_t *pat, const char32_t *str); /******************************************************************************* * Levenshtein/Damerau distance ******************************************************************************/ /* Normalization strategies for Levenshtein and Damerau. * FC_NORM_LSEQ Normalize by the length longest sequence. * FC_NORM_LALIGN Normalize by the longest alignement between the two input * sequences. This is more expensive (both in terms of space and * time) than FC_NORM_LSEQ, but (arguably) more accurate. For * details, see Heeringa, "Measuring Dialect Pronunciation * Differences using Levenshtein Distance". */ enum fc_norm_method { FC_NORM_LSEQ, FC_NORM_LALIGN, }; /* Computes the absolute Levenshtein distance between two sequences. */ int32_t fc_levenshtein(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /* Computes a normalized Levenshtein distance between two sequences. */ double fc_nlevenshtein(enum fc_norm_method method, const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /* Computes the absolute Damerau distance between two sequences. */ int32_t fc_damerau(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /* Computes a normalized Damerau distance between two sequences. */ double fc_ndamerau(enum fc_norm_method method, const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /* Computes the distance between the provided sequences upto a maximum value * of 1. If the distance between the sequences is larger than that, a value * larger than 1 is returned. */ int32_t fc_lev_bounded1(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /* Same as "fc_lev_bounded1()", but for distance 2. */ int32_t fc_lev_bounded2(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /* Table of pointers to the above functions. * The function at index 0 is a dummy one that compares sequences for equality. */ extern int32_t (*const fc_lev_bounded[3])(const char32_t *, int32_t, const char32_t *, int32_t); /* Computes the jaro distance between two sequences. * Contrary to the canonical implementation, this returns 0 for identity, and * 1 to indicate absolute difference, instead of the reverse. */ double fc_jaro(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /******************************************************************************* * Longest Common Substring and Subsequence ******************************************************************************/ /* Computes the length of the longest common substring between two sequences. */ int32_t fc_lcsubstr(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /* Like fc_lcsubstr(), but also makes possible the extraction of a longest * common substring. If "pos" is not NULL, it is made to point to the leftmost * longest common substring in "seq1". If the length of the longest common * substring is zero, "pos", if not NULL, is made to point to the end of "seq1". */ int32_t fc_lcsubstr_extract(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2, const char32_t **pos); /* Computes the length of the longest common subsequence between two * sequences. */ int32_t fc_lcsubseq(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /* Normalized version of fc_lcsubseq(). */ double fc_nlcsubseq(const char32_t *seq1, int32_t len1, const char32_t *seq2, int32_t len2); /******************************************************************************* * Memoized string metrics. ******************************************************************************/ enum fc_metric { FC_LEVENSHTEIN, FC_DAMERAU, FC_LCSUBSTR, FC_LCSUBSEQ, FC_METRIC_NR, }; struct fc_memo { int32_t (*compute)(struct fc_memo *, const char32_t *, int32_t); void *matrix; /* Similarity matrix. */ int32_t mdim; /* Matrix dimension. */ const char32_t *seq1; /* Reference sequence. */ int32_t len1; /* Length of the reference sequence. */ char32_t *seq2; /* Previous sequence seen. */ int32_t len2; /* Length of this sequence. */ int32_t max_dist; /* Maximum allowed distance (for Levenshtein). */ }; /* Initializer. * metric: the metric to use. * max_len: the maximum possible length of a sequence (or higher). The * internal matrix is never reallocated. * max_dist: the maximum allowed edit distance (the lower, the faster). This * parameter is only used if the chosen metric is Levenshtein or Damerau. */ void fc_memo_init(struct fc_memo *, enum fc_metric metric, int32_t max_len, int32_t max_dist); /* Destructor. */ void fc_memo_fini(struct fc_memo *); /* Returns the chosen metric. */ enum fc_metric fc_memo_metric(const struct fc_memo *); /* Sets the reference sequence. * It is not copied internally, and should then be available until either a new * reference sequence is set, or this object is deinitialized. The reference * sequence can be changed several times without deinitializing this object * first. */ void fc_memo_set_ref(struct fc_memo *, const char32_t *seq1, int32_t len1); /* Compares the reference sequence to a new one. */ static inline int32_t fc_memo_compute(struct fc_memo *m, const char32_t *seq2, int32_t len2) { return m->compute(m, seq2, len2); } /* Concrete prototypes for the memoized string metrics functions. * The function called must match the chosen metric. */ int32_t fc_memo_levenshtein(struct fc_memo *, const char32_t *, int32_t); int32_t fc_memo_damerau(struct fc_memo *, const char32_t *, int32_t); int32_t fc_memo_lcsubstr(struct fc_memo *, const char32_t *, int32_t); int32_t fc_memo_lcsubseq(struct fc_memo *, const char32_t *, int32_t); #endif #line 5 "match.c" #line 1 "heap.h" #ifndef VB_HEAP_H #define VB_HEAP_H #include #include #include #define VB_HEAP_INIT(array, max_elems) {.data = array, .max = max_elems} #define VB_HEAP_DECLARE(NAME, T, COMPAR) \ struct NAME { \ T *data; \ size_t size; \ size_t max; \ }; \ \ static void NAME##_push(struct NAME *heap, T item) \ { \ assert(heap->max > 0); \ T *restrict data = heap->data; \ const size_t max = heap->max; \ \ if (heap->size < max) { \ size_t cur = heap->size; \ while (cur) { \ const size_t par = (cur - 1) >> 1; \ if (COMPAR(data[par], item) >= 0) \ break; \ data[cur] = data[par]; \ cur = par; \ } \ data[cur] = item; \ heap->size++; \ } else if (COMPAR(item, data[0]) < 0) { \ size_t cur = 0; \ const size_t lim = max >> 1; \ while (cur < lim) { \ size_t child = (cur << 1) + 1; \ if (child < max - 1 && COMPAR(data[child], data[child + 1]) < 0) \ child++; \ if (COMPAR(item, data[child]) >= 0) \ break; \ data[cur] = data[child]; \ cur = child; \ } \ data[cur] = item; \ } \ } \ \ static int NAME##_qsort_cmp_(const void *a, const void *b) \ { \ return COMPAR(*(const T *)a, *(const T *)b); \ } \ static void NAME##_finish(struct NAME *heap) \ { \ qsort(heap->data, heap->size, sizeof *heap->data, NAME##_qsort_cmp_); \ } #endif #line 8 "match.c" static const char glob_chars[] = "*?[]"; /******************************************************************************* * Pattern matching. ******************************************************************************/ static int match_exact(const struct mini *lex, struct vb_match_ctx *c) { if (mn_contains(lex, c->str, c->len)) c->handler(c->arg, c->str, c->len); c->query->pagination.last_page = true; return VB_OK; } static int match_prefix(const struct mini *lex, struct vb_match_ctx *c) { bool first_page; struct mini_iter it; uint32_t pos = c->query->pagination.last_pos; if (pos) { first_page = false; mn_iter_initn(&it, lex, pos); } else { first_page = true; pos = mn_iter_initp(&it, lex, c->str, c->len); } const char *term; size_t len; size_t page_size = c->query->page_size; while ((term = mn_iter_next(&it, &len))) { if (!first_page && (len < c->len || memcmp(c->str, term, c->len))) break; if (!page_size--) { c->query->pagination.last_pos = pos; return VB_OK; } else { c->handler(c->arg, term, len); } pos++; } c->query->pagination.last_page = true; return VB_OK; } static int match_substr(const struct mini *lex, struct vb_match_ctx *c) { struct mini_iter it; uint32_t pos = c->query->pagination.last_pos; if (pos) { mn_iter_initn(&it, lex, pos); } else { pos = mn_iter_init(&it, lex); } const char *term; size_t len; size_t page_size = c->query->page_size; while ((term = mn_iter_next(&it, &len))) { if (len >= c->len && strstr(term, c->str)) { if (!page_size--) { c->query->pagination.last_pos = pos; return VB_OK; } else { c->handler(c->arg, term, len); } } pos++; } c->query->pagination.last_page = true; return VB_OK; } static int match_suffix(const struct mini *lex, struct vb_match_ctx *c) { uint32_t pos = c->query->pagination.last_pos; struct mini_iter it; if (pos) { mn_iter_initn(&it, lex, pos); } else { pos = mn_iter_init(&it, lex); } const char *term; size_t len; size_t page_size = c->query->page_size; while ((term = mn_iter_next(&it, &len))) { if (len >= c->len && !memcmp(c->str, &term[len - c->len], c->len)) { if (!page_size--) { c->query->pagination.last_pos = pos; return VB_OK; } else { c->handler(c->arg, term, len); } } pos++; } c->query->pagination.last_page = true; return VB_OK; } static int match_glob(const struct mini *lex, struct vb_match_ctx *c) { struct mini_iter it; uint32_t pos = c->query->pagination.last_pos; size_t pfx_len = strcspn(c->str, glob_chars); if (pos) { mn_iter_initn(&it, lex, pos); } else { pos = mn_iter_initp(&it, lex, c->str, pfx_len); } int ret = VB_OK; char32_t upat[MN_MAX_WORD_LEN + 1]; if (vb_utf8_decode(upat, &c->str[pfx_len], c->len - pfx_len) < 0) { ret = VB_EQUTF8; goto fini; } const char *term; size_t len; size_t page_size = c->query->page_size; while ((term = mn_iter_next(&it, &len))) { if (pfx_len && (len < pfx_len || memcmp(term, c->str, pfx_len))) break; char32_t uterm[MN_MAX_WORD_LEN + 1]; if (vb_utf8_decode(uterm, &term[pfx_len], len - pfx_len) < 0) { ret = VB_ELUTF8; goto fini; } if (fc_glob(upat, uterm)) { if (!page_size--) { c->query->pagination.last_pos = pos; return VB_OK; } else { c->handler(c->arg, term, len); } } pos++; } fini: c->query->pagination.last_page = true; return ret; } /******************************************************************************* * Fuzzy matching. ******************************************************************************/ static int32_t levenshtein_compute(struct fc_memo *m, const char32_t *seq, int32_t len) { int32_t ret = fc_memo_levenshtein(m, seq, len); return ret > m->max_dist ? INT32_MAX : ret; } static int32_t damerau_compute(struct fc_memo *m, const char32_t *seq, int32_t len) { int32_t ret = fc_memo_damerau(m, seq, len); return ret > m->max_dist ? INT32_MAX : ret; } static int32_t lcsubstr_compute(struct fc_memo *m, const char32_t *seq, int32_t len) { return -fc_memo_lcsubstr(m, seq, len); } static int32_t lcsubseq_compute(struct fc_memo *m, const char32_t *seq, int32_t len) { return -2. * fc_memo_lcsubseq(m, seq, len) / (double)(m->len1 + len) * 1000.; } struct vb_match_infos { uint32_t pos; int32_t weight; }; static int vb_match_infos_cmp(const struct vb_match_infos a, const struct vb_match_infos b) { if (a.weight == b.weight) return a.pos < b.pos ? -1 : a.pos > b.pos; return a.weight < b.weight ? -1 : a.weight > b.weight; } VB_HEAP_DECLARE(vb_heap, struct vb_match_infos, vb_match_infos_cmp) static int match_fuzzy(const struct mini *lex, struct vb_match_ctx *c) { static const int metrics[] = { [VB_LEVENSHTEIN] = FC_LEVENSHTEIN, [VB_DAMERAU] = FC_DAMERAU, [VB_LCSUBSTR] = FC_LCSUBSTR, [VB_LCSUBSEQ] = FC_LCSUBSEQ, }; static int32_t (*const compute_fns[])(struct fc_memo *, const char32_t *, int32_t) = { [FC_LEVENSHTEIN] = levenshtein_compute, [FC_DAMERAU] = damerau_compute, [FC_LCSUBSTR] = lcsubstr_compute, [FC_LCSUBSEQ] = lcsubseq_compute, }; char32_t seq1[MN_MAX_WORD_LEN + 1]; int32_t len1 = vb_utf8_decode(seq1, c->str, c->len); if (len1 < 0) { c->query->pagination.last_page = true; return VB_EQUTF8; } struct mini_iter it; uint32_t pos; if (c->query->prefix_len && !c->query->pagination.last_pos && c->mode != VB_LCSUBSTR) { size_t pfx_len; /* If the required common prefix length is longer than the reference word, * there could still be an exact match. */ if (c->query->prefix_len > len1) return match_exact(lex, c); pfx_len = vb_utf8_bytes(seq1, c->query->prefix_len); pos = mn_iter_initp(&it, lex, c->str, pfx_len); } else { pos = mn_iter_init(&it, lex); } enum fc_metric metric = metrics[c->mode]; struct fc_memo m; fc_memo_init(&m, metric, MN_MAX_WORD_LEN, c->query->max_dist); fc_memo_set_ref(&m, seq1, len1); m.compute = compute_fns[metric]; const char *term; size_t len; size_t count = 0; bool first_page = c->query->pagination.last_pos == 0; struct vb_match_infos cands[VB_MAX_PAGE_SIZE]; struct vb_heap heap = VB_HEAP_INIT(cands, c->query->page_size); struct vb_match_infos last_min = { .pos = c->query->pagination.last_pos, .weight = c->query->pagination.last_weight, }; while ((term = mn_iter_next(&it, &len))) { char32_t seq2[MN_MAX_WORD_LEN + 1]; int32_t len2 = vb_utf8_decode(seq2, term, len); if (len2 < 0) { c->query->pagination.last_page = true; return VB_ELUTF8; } struct vb_match_infos x = { .pos = pos, .weight = fc_memo_compute(&m, seq2, len2), }; if (x.weight != INT32_MAX && (first_page || vb_match_infos_cmp(x, last_min) > 0)) { count++; vb_heap_push(&heap, x); } pos++; } fc_memo_fini(&m); vb_heap_finish(&heap); for (size_t i = 0; i < heap.size; i++) { len = mn_extract(lex, heap.data[i].pos, seq1); c->handler(c->arg, (const char *)seq1, len); } if (heap.size) { c->query->pagination.last_pos = heap.data[heap.size - 1].pos; c->query->pagination.last_weight = heap.data[heap.size - 1].weight; } if (count <= heap.size) c->query->pagination.last_page = true; return VB_OK; } int (*const vb_match_funcs[])(const struct mini *, struct vb_match_ctx *) = { [VB_EXACT] = match_exact, [VB_PREFIX] = match_prefix, [VB_SUBSTR] = match_substr, [VB_SUFFIX] = match_suffix, [VB_GLOB] = match_glob, [VB_LEVENSHTEIN] = match_fuzzy, [VB_DAMERAU] = match_fuzzy, [VB_LCSUBSTR] = match_fuzzy, [VB_LCSUBSEQ] = match_fuzzy, }; #line 1 "parse.c" #line 1 "src/parse.rl" #include #line 43 "src/parse.rl" #line 12 "src/parse.c" static const char _vb_simplify_glob_actions[] = { 0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5 }; static const char _vb_simplify_glob_key_offsets[] = { 0, 4, 8, 9, 10, 11, 15, 19 }; static const char _vb_simplify_glob_trans_keys[] = { 42, 63, 91, 93, 42, 63, 91, 93, 42, 42, 42, 42, 63, 91, 93, 42, 63, 91, 93, 42, 0 }; static const char _vb_simplify_glob_single_lengths[] = { 4, 4, 1, 1, 1, 4, 4, 1 }; static const char _vb_simplify_glob_range_lengths[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static const char _vb_simplify_glob_index_offsets[] = { 0, 5, 10, 12, 14, 16, 21, 26 }; static const char _vb_simplify_glob_indicies[] = { 1, 2, 2, 2, 0, 3, 2, 2, 2, 0, 4, 2, 5, 2, 6, 2, 8, 2, 2, 2, 7, 9, 2, 2, 2, 7, 10, 2, 0 }; static const char _vb_simplify_glob_trans_targs[] = { 1, 5, 3, 2, 2, 4, 4, 6, 5, 7, 7 }; static const char _vb_simplify_glob_trans_actions[] = { 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0 }; static const char _vb_simplify_glob_eof_actions[] = { 9, 9, 3, 11, 11, 11, 7, 5 }; static const int vb_simplify_glob_start = 0; static const int vb_simplify_glob_en_main = 0; #line 46 "src/parse.rl" static void vb_simplify_glob(struct vb_match_ctx *c, char *buf) { int cs; const char *p = c->str; const char *pe = &p[c->len]; const char *const eof = pe; size_t len = 0; #line 77 "src/parse.c" { cs = vb_simplify_glob_start; } #line 56 "src/parse.rl" #line 84 "src/parse.c" { int _klen; unsigned int _trans; const char *_acts; unsigned int _nacts; const char *_keys; if ( p == pe ) goto _test_eof; _resume: _keys = _vb_simplify_glob_trans_keys + _vb_simplify_glob_key_offsets[cs]; _trans = _vb_simplify_glob_index_offsets[cs]; _klen = _vb_simplify_glob_single_lengths[cs]; if ( _klen > 0 ) { const char *_lower = _keys; const char *_mid; const char *_upper = _keys + _klen - 1; while (1) { if ( _upper < _lower ) break; _mid = _lower + ((_upper-_lower) >> 1); if ( (*p) < *_mid ) _upper = _mid - 1; else if ( (*p) > *_mid ) _lower = _mid + 1; else { _trans += (unsigned int)(_mid - _keys); goto _match; } } _keys += _klen; _trans += _klen; } _klen = _vb_simplify_glob_range_lengths[cs]; if ( _klen > 0 ) { const char *_lower = _keys; const char *_mid; const char *_upper = _keys + (_klen<<1) - 2; while (1) { if ( _upper < _lower ) break; _mid = _lower + (((_upper-_lower) >> 1) & ~1); if ( (*p) < _mid[0] ) _upper = _mid - 2; else if ( (*p) > _mid[1] ) _lower = _mid + 2; else { _trans += (unsigned int)((_mid - _keys)>>1); goto _match; } } _trans += _klen; } _match: _trans = _vb_simplify_glob_indicies[_trans]; cs = _vb_simplify_glob_trans_targs[_trans]; if ( _vb_simplify_glob_trans_actions[_trans] == 0 ) goto _again; _acts = _vb_simplify_glob_actions + _vb_simplify_glob_trans_actions[_trans]; _nacts = (unsigned int) *_acts++; while ( _nacts-- > 0 ) { switch ( *_acts++ ) { case 0: #line 8 "src/parse.rl" { buf[len++] = (*p); } break; #line 162 "src/parse.c" } } _again: if ( ++p != pe ) goto _resume; _test_eof: {} if ( p == eof ) { const char *__acts = _vb_simplify_glob_actions + _vb_simplify_glob_eof_actions[cs]; unsigned int __nacts = (unsigned int) *__acts++; while ( __nacts-- > 0 ) { switch ( *__acts++ ) { case 1: #line 16 "src/parse.rl" { c->mode = VB_PREFIX; c->str = buf; c->len = len - 1; } break; case 2: #line 21 "src/parse.rl" { buf[len - 1] = '\0'; /* We use strstr() in the matching function. */ c->mode = VB_SUBSTR; c->str = buf + 1; c->len = len - 2; } break; case 3: #line 27 "src/parse.rl" { c->mode = VB_SUFFIX; c->str = buf + 1; c->len = len - 1; } break; case 4: #line 32 "src/parse.rl" { c->mode = VB_EXACT; } break; case 5: #line 35 "src/parse.rl" { c->mode = VB_GLOB; c->str = buf; c->len = len; } break; #line 215 "src/parse.c" } } } } #line 57 "src/parse.rl" (void)vb_simplify_glob_en_main; } static void vb_set_match_mode(struct vb_match_ctx *c) { if (c->len == 0) { c->mode = VB_EXACT; return; } switch (*c->str) { #define _(C, NAME) \ case C: \ c->mode = VB_##NAME; \ c->str++; \ c->len--; \ break; _('+', LCSUBSTR) _('@', DAMERAU) _('#', SUBSTR) #undef _ default: c->mode = VB_GLOB; break; } } void vb_parse_query(struct vb_match_ctx *c, char *buf) { if (c->mode == VB_AUTO) vb_set_match_mode(c); if (c->mode == VB_SUBSTR) { /* We use strstr() in the matching function. */ memcpy(buf, c->str, c->len); buf[c->len] = '\0'; c->str = buf; } else if (c->mode == VB_GLOB) { vb_simplify_glob(c, buf); } } #line 1 "utf8.c" #include #include static const unsigned char vb_len_table[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; #define vb_char_len(c) vb_len_table[(unsigned char)(c)] static char32_t vb_decode_char(const unsigned char *str, size_t clen) { switch (clen) { case 1: return str[0]; case 2: return ((str[0] & 0x1F) << 6) | (str[1] & 0x3F); case 3: return ((str[0] & 0x0f) << 12) | ((str[1] & 0x3f) << 6) | (str[2] & 0x3f); default: assert(clen == 4); return ((str[0] & 0x07) << 18) | ((str[1] & 0x3f) << 12) | ((str[2] & 0x3f) << 6) | (str[3] & 0x3f); } } #define vb_decode_char(str, len) vb_decode_char((const unsigned char *)(str), len) int32_t vb_utf8_decode(char32_t *restrict dest, const char *restrict str, size_t len) { assert(len < INT32_MAX); int32_t ulen = 0; for (size_t i = 0; i < len; ) { size_t clen = vb_char_len(str[i]); if (clen == 0 || i + clen > len) return -1; dest[ulen++] = vb_decode_char(&str[i], clen); i += clen; } dest[ulen] = U'\0'; return ulen; } size_t vb_utf8_bytes(const char32_t *str, size_t nr) { size_t pfx = 0; for (size_t i = 0; i < nr; i++) { if (str[i] < 0x80) pfx++; else if (str[i] < 0x800) pfx += 2; else if (str[i] < 0x10000) pfx += 3; else pfx += 4; } return pfx; }