working v12

2026-05-17 13:10:44 +02:00
parent 6a87750329
commit dc273506b7
9 changed files with 121 additions and 50 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ steel_decimal/tests/property_tests.proptest-regressions
 canvas/*.toml
 .aider*
 .codex
 TODO.md
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -493,7 +493,7 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
 [[package]]
 name = "canvas"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -586,7 +586,7 @@ dependencies = [
 [[package]]
 name = "client"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -642,7 +642,7 @@ dependencies = [
 [[package]]
 name = "common"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "prost 0.13.5",
 "prost-build 0.14.1",
@@ -3117,7 +3117,7 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
 [[package]]
 name = "search"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "anyhow",
 "common",
@@ -3216,7 +3216,7 @@ dependencies = [
 [[package]]
 name = "server"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "anyhow",
 "bcrypt",
@@ -4549,7 +4549,7 @@ dependencies = [
 [[package]]
 name = "validation-core"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "regex",
 "serde",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,7 @@ resolver = "2"
 [workspace.package]
 # TODO: idk how to do the name, fix later
 # name = "komp_ac"
-version = "0.6.9"
+version = "0.6.12"
 edition = "2021"
 license = "GPL-3.0-or-later"
 authors = ["Filip Priečinský <filippriec@gmail.com>"]
--- a/2
+++ b/2
--- a/common/src/search.rs
+++ b/common/src/search.rs
@@ -1,8 +1,8 @@
 use std::path::{Path, PathBuf};
 use tantivy::schema::{
-    Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, INDEXED, STORED,
+    Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, TextOptions,
-    STRING,
+    INDEXED, STORED, STRING,
 };
 use tantivy::tokenizer::{
    AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
@@ -13,6 +13,7 @@ use tantivy::Index;
 pub const F_PG_ID: &str = "pg_id";
 pub const F_TABLE_NAME: &str = "table_name";
 pub const F_ROW_KEY: &str = "row_key";
 pub const F_ALL_TEXT: &str = "all_text";
 pub const F_DATA_WORD: &str = "data_word";
 pub const F_DATA_NGRAM: &str = "data_ngram";
 pub const F_DATA_EXACT: &str = "data_exact";
@@ -59,6 +60,7 @@ pub fn create_search_schema() -> Schema {
    schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED);
    schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED);
    schema_builder.add_text_field(F_ROW_KEY, STRING | STORED);
    schema_builder.add_text_field(F_ALL_TEXT, text_options(TOK_WORD));
    schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false));
    schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false));
@@ -67,6 +69,14 @@ pub fn create_search_schema() -> Schema {
    schema_builder.build()
 }
 /// Builds the [`TextOptions`] for a plain text field indexed with the named tokenizer.
 ///
 /// The indexing options record term frequencies and positions
 /// ([`IndexRecordOption::WithFreqsAndPositions`]); nothing else is changed
 /// relative to `TextOptions::default()`.
 fn text_options(tokenizer_name: &str) -> TextOptions {
    let index_settings = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer(tokenizer_name);
    TextOptions::default().set_indexing_options(index_settings)
 }
 fn json_options(tokenizer_name: &str, with_positions: bool, stored: bool) -> JsonObjectOptions {
    let index_option = if with_positions {
        IndexRecordOption::WithFreqsAndPositions
@@ -153,6 +163,7 @@ pub struct SchemaFields {
    pub pg_id: Field,
    pub table_name: Field,
    pub row_key: Field,
    pub all_text: Field,
    pub data_word: Field,
    pub data_ngram: Field,
    pub data_exact: Field,
@@ -164,6 +175,7 @@ impl SchemaFields {
            pg_id: get_field(schema, F_PG_ID)?,
            table_name: get_field(schema, F_TABLE_NAME)?,
            row_key: get_field(schema, F_ROW_KEY)?,
            all_text: get_field(schema, F_ALL_TEXT)?,
            data_word: get_field(schema, F_DATA_WORD)?,
            data_ngram: get_field(schema, F_DATA_NGRAM)?,
            data_exact: get_field(schema, F_DATA_EXACT)?,
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@@ -112,6 +112,7 @@ impl SearcherService {
        Ok(Response::new(SearchResponse { hits }))
    }
 }
 struct ProfileIndex {
@@ -133,7 +134,7 @@ impl ProfileIndex {
            .map_err(|e| Status::internal(format!("Failed to build index reader: {}", e)))?;
        let fields = SchemaFields::from(&index.schema()).map_err(|e| {
            Status::internal(format!(
-                "Search index schema mismatch. Reindex required: {}",
+                "Search index schema mismatch. Delete the stale index and create it again: {}",
                e
            ))
        })?;
@@ -205,6 +206,22 @@ fn validate_identifier(value: &str, field_name: &str) -> Result<(), Status> {
    Ok(())
 }
 /// Checks that a search constraint's column name is usable.
 ///
 /// # Errors
 ///
 /// Returns `Status::invalid_argument` when `value` is empty, or when it
 /// contains any control character.
 fn validate_search_column(value: &str) -> Result<(), Status> {
    // `char::is_control` already matches NUL (U+0000), so no separate check is needed.
    let problem = if value.is_empty() {
        Some("constraint.column must not be empty")
    } else if value.chars().any(char::is_control) {
        Some("constraint.column contains invalid characters")
    } else {
        None
    };

    match problem {
        Some(message) => Err(Status::invalid_argument(message)),
        None => Ok(()),
    }
 }
 /// Joins a profile name and a table name into the form `"profile"."table"`.
 ///
 /// Each part is wrapped in double quotes and the two are separated by a dot.
 /// The inputs are inserted verbatim: embedded double quotes are not escaped.
 fn qualify_profile_table(profile_name: &str, table_name: &str) -> String {
    ["\"", profile_name, "\".\"", table_name, "\""].concat()
 }
@@ -258,12 +275,7 @@ fn normalize_request(req: SearchRequest) -> Result<NormalizedSearchRequest, Stat
    for constraint in req.must {
        let column = constraint.column.trim();
-        if column.is_empty() {
+        validate_search_column(column)?;
            return Err(Status::invalid_argument(
                "constraint.column must not be empty",
            ));
        }
        validate_identifier(column, "constraint.column")?;
        let query = constraint.query.trim();
        if query.is_empty() {
--- a/search/src/query_builder.rs
+++ b/search/src/query_builder.rs
@@ -1,5 +1,6 @@
 use common::search::{
-    json_path_term, normalize_exact, tokenize_ngram, tokenize_word, SchemaFields,
+    json_path_term, normalize_column_name, normalize_exact, tokenize_ngram, tokenize_word,
    SchemaFields,
 };
 use tantivy::query::{
    BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query, QueryParser,
@@ -48,7 +49,7 @@ pub fn build_master_query(
    let free_words = tokenize_word(free_query);
    if !free_words.is_empty() {
        let predicate = fuzzy_predicate_unscoped(index, fields, &free_words)?;
-        clauses.push((Occur::Should, predicate));
+        clauses.push((Occur::Must, predicate));
        has_search_clause = true;
    }
@@ -79,7 +80,8 @@ fn exact_predicate(
        ));
    }
-    let term = json_path_term(fields.data_exact, column, &normalized_value);
+    let column = normalize_column_name(column);
    let term = json_path_term(fields.data_exact, &column, &normalized_value);
    Ok(Box::new(TermQuery::new(term, IndexRecordOption::Basic)))
 }
@@ -95,11 +97,13 @@ fn fuzzy_predicate_scoped(
        ));
    }
    let column = normalize_column_name(column);
    let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
    let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
    for word in &words {
-        let term = json_path_term(fields.data_word, column, word);
+        let term = json_path_term(fields.data_word, &column, word);
        let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new();
        alternates.push((
@@ -136,7 +140,7 @@ fn fuzzy_predicate_scoped(
        let phrase_terms: Vec<(usize, Term)> = words
            .iter()
            .enumerate()
-            .map(|(offset, word)| (offset, json_path_term(fields.data_word, column, word)))
+            .map(|(offset, word)| (offset, json_path_term(fields.data_word, &column, word)))
            .collect();
        let phrase = PhraseQuery::new_with_offset_and_slop(phrase_terms, 3);
        layers.push((
@@ -150,7 +154,7 @@ fn fuzzy_predicate_scoped(
        let ngram_clauses: Vec<(Occur, Box<dyn Query>)> = ngrams
            .into_iter()
            .map(|gram| {
-                let term = json_path_term(fields.data_ngram, column, &gram);
+                let term = json_path_term(fields.data_ngram, &column, &gram);
                (
                    Occur::Must,
                    Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query>,
@@ -176,35 +180,43 @@ fn fuzzy_predicate_unscoped(
 ) -> Result<Box<dyn Query>, Status> {
    let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
-    {
+    let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
-        let parser = QueryParser::for_index(index, vec![fields.data_word]);
+    for word in words {
-        let query_string = words
+        let term = Term::from_field_text(fields.all_text, word);
-            .iter()
+        let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new();
            .map(|word| format!("+{}*", word))
            .collect::<Vec<_>>()
            .join(" ");
        if let Ok(query) = parser.parse_query(&query_string) {
            layers.push((Occur::Should, Box::new(BoostQuery::new(query, 4.0))));
        }
    }
-    {
+        alternates.push((
-        let parser = QueryParser::for_index(index, vec![fields.data_word]);
+            Occur::Should,
-        let query_string = words
+            Box::new(BoostQuery::new(
-            .iter()
+                Box::new(TermQuery::new(term.clone(), IndexRecordOption::WithFreqs)),
-            .map(|word| match fuzzy_distance(word.chars().count()) {
+                4.0,
-                Some(distance) => format!("+{}~{}", word, distance),
+            )),
-                None => format!("+{}", word),
+        ));
-            })
+
-            .collect::<Vec<_>>()
+        alternates.push((
-            .join(" ");
+            Occur::Should,
-        if let Ok(query) = parser.parse_query(&query_string) {
+            Box::new(BoostQuery::new(
-            layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0))));
+                Box::new(FuzzyTermQuery::new_prefix(term.clone(), 0, false)),
                3.0,
            )),
        ));
        if let Some(distance) = fuzzy_distance(word.chars().count()) {
            alternates.push((
                Occur::Should,
                Box::new(BoostQuery::new(
                    Box::new(FuzzyTermQuery::new(term, distance, true)),
                    2.0,
                )),
            ));
        }
        per_word_clauses.push((Occur::Must, Box::new(BooleanQuery::new(alternates))));
    }
    layers.push((Occur::Should, Box::new(BooleanQuery::new(per_word_clauses))));
    if words.len() > 1 {
-        let parser = QueryParser::for_index(index, vec![fields.data_word]);
+        let parser = QueryParser::for_index(index, vec![fields.all_text]);
        let query_string = format!("\"{}\"~3", words.join(" "));
        if let Ok(query) = parser.parse_query(&query_string) {
            layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0))));
@@ -212,10 +224,10 @@ fn fuzzy_predicate_unscoped(
    }
    {
-        let parser = QueryParser::for_index(index, vec![fields.data_ngram]);
+        let parser = QueryParser::for_index(index, vec![fields.all_text]);
        let query_string = words
            .iter()
-            .map(|word| format!("+{}", word))
+            .map(|word| format!("+{}*", word))
            .collect::<Vec<_>>()
            .join(" ");
        if let Ok(query) = parser.parse_query(&query_string) {
--- a/2
+++ b/2
--- a/tantivy_todo.md
+++ b/tantivy_todo.md
@@ -0,0 +1,34 @@
  1. Add explicit reindex/backfill tooling.
     Right now, only future PostTableData / PutTableData calls index rows. There should be an admin/dev command like:
     ReindexProfile(profile_name)
     ReindexTable(profile_name, table_name)
     ReindexRow(profile_name, table_name, id)
     This is the biggest missing piece.
  2. Stop using relative ./tantivy_indexes.
     Both writer and reader depend on the process working directory. Make it config/env-driven, e.g.
     TANTIVY_INDEX_DIR.
  3. Add index schema/version metadata.
     If you change tokenizers/schema later, old indexes should fail with a clear “index version mismatch, reindex
     required” instead of behaving strangely.
  4. Batch index commits.
     Current code opens a writer and commits per row. Fine for dev, not great for many inserts. A long-lived writer
     task batching commits every N docs or every short interval would be more reliable and faster.
  5. Make the indexing queue durable.
     The current mpsc queue is in-memory. If the server crashes after DB insert but before indexing, search is stale.
     For serious use, store pending index jobs in Postgres, process them, mark done.
  6. Index only live rows intentionally.
     handle_add_or_update currently fetches the row by id without checking deleted = false, and search filters out
     deleted rows later. I’d either skip indexing deleted rows or make delete/update semantics explicit.
  7. Add typed fields for numbers/dates if you need range queries.
     Right now numbers are converted to strings. Good for text search, bad for real numeric filtering/sorting. Tantivy
     can do numeric/date fields, but JSON text fields are not enough for robust range search.
  8. Decide column-name strategy.
     Indexing lowercases raw DB JSON keys. If the UI uses display names/aliases, column constraints can miss unless
     the frontend sends exactly what the index expects. I’d centralize display-name to physical-name mapping before
     search.
  9. Add delete hooks for table/profile deletion.
     When a table or profile is deleted, the matching Tantivy docs/index directory should be cleaned by code, not
     manually.