working v12

2026-05-17 13:10:44 +02:00
parent 6a87750329
commit dc273506b7
9 changed files with 121 additions and 50 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ steel_decimal/tests/property_tests.proptest-regressions
 canvas/*.toml
 .aider*
 .codex
+TODO.md
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -493,7 +493,7 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"

 [[package]]
 name = "canvas"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -586,7 +586,7 @@ dependencies = [

 [[package]]
 name = "client"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -642,7 +642,7 @@ dependencies = [

 [[package]]
 name = "common"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "prost 0.13.5",
 "prost-build 0.14.1",
@@ -3117,7 +3117,7 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"

 [[package]]
 name = "search"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "anyhow",
 "common",
@@ -3216,7 +3216,7 @@ dependencies = [

 [[package]]
 name = "server"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "anyhow",
 "bcrypt",
@@ -4549,7 +4549,7 @@ dependencies = [

 [[package]]
 name = "validation-core"
-version = "0.6.7"
+version = "0.6.11"
 dependencies = [
 "regex",
 "serde",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,7 @@ resolver = "2"
 [workspace.package]
 # TODO: idk how to do the name, fix later
 # name = "komp_ac"
-version = "0.6.9"
+version = "0.6.12"
 edition = "2021"
 license = "GPL-3.0-or-later"
 authors = ["Filip Priečinský <filippriec@gmail.com>"]
--- a/2
+++ b/2
--- a/common/src/search.rs
+++ b/common/src/search.rs
@@ -1,8 +1,8 @@
 use std::path::{Path, PathBuf};

 use tantivy::schema::{
-    Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, INDEXED, STORED,
-    STRING,
+    Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, TextOptions,
+    INDEXED, STORED, STRING,
 };
 use tantivy::tokenizer::{
    AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
@@ -13,6 +13,7 @@ use tantivy::Index;
 pub const F_PG_ID: &str = "pg_id";
 pub const F_TABLE_NAME: &str = "table_name";
 pub const F_ROW_KEY: &str = "row_key";
+pub const F_ALL_TEXT: &str = "all_text";
 pub const F_DATA_WORD: &str = "data_word";
 pub const F_DATA_NGRAM: &str = "data_ngram";
 pub const F_DATA_EXACT: &str = "data_exact";
@@ -59,6 +60,7 @@ pub fn create_search_schema() -> Schema {
    schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED);
    schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED);
    schema_builder.add_text_field(F_ROW_KEY, STRING | STORED);
+    schema_builder.add_text_field(F_ALL_TEXT, text_options(TOK_WORD));

    schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false));
    schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false));
@@ -67,6 +69,14 @@ pub fn create_search_schema() -> Schema {
    schema_builder.build()
 }

+fn text_options(tokenizer_name: &str) -> TextOptions {
+    let indexing = TextFieldIndexing::default()
+        .set_tokenizer(tokenizer_name)
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+
+    TextOptions::default().set_indexing_options(indexing)
+}
+
 fn json_options(tokenizer_name: &str, with_positions: bool, stored: bool) -> JsonObjectOptions {
    let index_option = if with_positions {
        IndexRecordOption::WithFreqsAndPositions
@@ -153,6 +163,7 @@ pub struct SchemaFields {
    pub pg_id: Field,
    pub table_name: Field,
    pub row_key: Field,
+    pub all_text: Field,
    pub data_word: Field,
    pub data_ngram: Field,
    pub data_exact: Field,
@@ -164,6 +175,7 @@ impl SchemaFields {
            pg_id: get_field(schema, F_PG_ID)?,
            table_name: get_field(schema, F_TABLE_NAME)?,
            row_key: get_field(schema, F_ROW_KEY)?,
+            all_text: get_field(schema, F_ALL_TEXT)?,
            data_word: get_field(schema, F_DATA_WORD)?,
            data_ngram: get_field(schema, F_DATA_NGRAM)?,
            data_exact: get_field(schema, F_DATA_EXACT)?,
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@@ -112,6 +112,7 @@ impl SearcherService {

        Ok(Response::new(SearchResponse { hits }))
    }
+
 }

 struct ProfileIndex {
@@ -133,7 +134,7 @@ impl ProfileIndex {
            .map_err(|e| Status::internal(format!("Failed to build index reader: {}", e)))?;
        let fields = SchemaFields::from(&index.schema()).map_err(|e| {
            Status::internal(format!(
-                "Search index schema mismatch. Reindex required: {}",
+                "Search index schema mismatch. Delete the stale index and create it again: {}",
                e
            ))
        })?;
@@ -205,6 +206,22 @@ fn validate_identifier(value: &str, field_name: &str) -> Result<(), Status> {
    Ok(())
 }

+fn validate_search_column(value: &str) -> Result<(), Status> {
+    if value.is_empty() {
+        return Err(Status::invalid_argument(
+            "constraint.column must not be empty",
+        ));
+    }
+
+    if value.chars().any(|ch| ch.is_control() || ch == '\0') {
+        return Err(Status::invalid_argument(
+            "constraint.column contains invalid characters",
+        ));
+    }
+
+    Ok(())
+}
+
 fn qualify_profile_table(profile_name: &str, table_name: &str) -> String {
    format!("\"{}\".\"{}\"", profile_name, table_name)
 }
@@ -258,12 +275,7 @@ fn normalize_request(req: SearchRequest) -> Result<NormalizedSearchRequest, Stat

    for constraint in req.must {
        let column = constraint.column.trim();
-        if column.is_empty() {
-            return Err(Status::invalid_argument(
-                "constraint.column must not be empty",
-            ));
-        }
-        validate_identifier(column, "constraint.column")?;
+        validate_search_column(column)?;

        let query = constraint.query.trim();
        if query.is_empty() {
--- a/search/src/query_builder.rs
+++ b/search/src/query_builder.rs
@@ -1,5 +1,6 @@
 use common::search::{
-    json_path_term, normalize_exact, tokenize_ngram, tokenize_word, SchemaFields,
+    json_path_term, normalize_column_name, normalize_exact, tokenize_ngram, tokenize_word,
+    SchemaFields,
 };
 use tantivy::query::{
    BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query, QueryParser,
@@ -48,7 +49,7 @@ pub fn build_master_query(
    let free_words = tokenize_word(free_query);
    if !free_words.is_empty() {
        let predicate = fuzzy_predicate_unscoped(index, fields, &free_words)?;
-        clauses.push((Occur::Should, predicate));
+        clauses.push((Occur::Must, predicate));
        has_search_clause = true;
    }

@@ -79,7 +80,8 @@ fn exact_predicate(
        ));
    }

-    let term = json_path_term(fields.data_exact, column, &normalized_value);
+    let column = normalize_column_name(column);
+    let term = json_path_term(fields.data_exact, &column, &normalized_value);
    Ok(Box::new(TermQuery::new(term, IndexRecordOption::Basic)))
 }

@@ -95,11 +97,13 @@ fn fuzzy_predicate_scoped(
        ));
    }

+    let column = normalize_column_name(column);
+
    let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();

    let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
    for word in &words {
-        let term = json_path_term(fields.data_word, column, word);
+        let term = json_path_term(fields.data_word, &column, word);
        let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new();

        alternates.push((
@@ -136,7 +140,7 @@ fn fuzzy_predicate_scoped(
        let phrase_terms: Vec<(usize, Term)> = words
            .iter()
            .enumerate()
-            .map(|(offset, word)| (offset, json_path_term(fields.data_word, column, word)))
+            .map(|(offset, word)| (offset, json_path_term(fields.data_word, &column, word)))
            .collect();
        let phrase = PhraseQuery::new_with_offset_and_slop(phrase_terms, 3);
        layers.push((
@@ -150,7 +154,7 @@ fn fuzzy_predicate_scoped(
        let ngram_clauses: Vec<(Occur, Box<dyn Query>)> = ngrams
            .into_iter()
            .map(|gram| {
-                let term = json_path_term(fields.data_ngram, column, &gram);
+                let term = json_path_term(fields.data_ngram, &column, &gram);
                (
                    Occur::Must,
                    Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query>,
@@ -176,35 +180,43 @@ fn fuzzy_predicate_unscoped(
 ) -> Result<Box<dyn Query>, Status> {
    let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();

-    {
-        let parser = QueryParser::for_index(index, vec![fields.data_word]);
-        let query_string = words
-            .iter()
-            .map(|word| format!("+{}*", word))
-            .collect::<Vec<_>>()
-            .join(" ");
-        if let Ok(query) = parser.parse_query(&query_string) {
-            layers.push((Occur::Should, Box::new(BoostQuery::new(query, 4.0))));
-        }
-    }
+    let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
+    for word in words {
+        let term = Term::from_field_text(fields.all_text, word);
+        let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new();

-    {
-        let parser = QueryParser::for_index(index, vec![fields.data_word]);
-        let query_string = words
-            .iter()
-            .map(|word| match fuzzy_distance(word.chars().count()) {
-                Some(distance) => format!("+{}~{}", word, distance),
-                None => format!("+{}", word),
-            })
-            .collect::<Vec<_>>()
-            .join(" ");
-        if let Ok(query) = parser.parse_query(&query_string) {
-            layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0))));
+        alternates.push((
+            Occur::Should,
+            Box::new(BoostQuery::new(
+                Box::new(TermQuery::new(term.clone(), IndexRecordOption::WithFreqs)),
+                4.0,
+            )),
+        ));
+
+        alternates.push((
+            Occur::Should,
+            Box::new(BoostQuery::new(
+                Box::new(FuzzyTermQuery::new_prefix(term.clone(), 0, false)),
+                3.0,
+            )),
+        ));
+
+        if let Some(distance) = fuzzy_distance(word.chars().count()) {
+            alternates.push((
+                Occur::Should,
+                Box::new(BoostQuery::new(
+                    Box::new(FuzzyTermQuery::new(term, distance, true)),
+                    2.0,
+                )),
+            ));
        }
+
+        per_word_clauses.push((Occur::Must, Box::new(BooleanQuery::new(alternates))));
    }
+    layers.push((Occur::Should, Box::new(BooleanQuery::new(per_word_clauses))));

    if words.len() > 1 {
-        let parser = QueryParser::for_index(index, vec![fields.data_word]);
+        let parser = QueryParser::for_index(index, vec![fields.all_text]);
        let query_string = format!("\"{}\"~3", words.join(" "));
        if let Ok(query) = parser.parse_query(&query_string) {
            layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0))));
@@ -212,10 +224,10 @@ fn fuzzy_predicate_unscoped(
    }

    {
-        let parser = QueryParser::for_index(index, vec![fields.data_ngram]);
+        let parser = QueryParser::for_index(index, vec![fields.all_text]);
        let query_string = words
            .iter()
-            .map(|word| format!("+{}", word))
+            .map(|word| format!("+{}*", word))
            .collect::<Vec<_>>()
            .join(" ");
        if let Ok(query) = parser.parse_query(&query_string) {
--- a/2
+++ b/2
--- a/tantivy_todo.md
+++ b/tantivy_todo.md
@@ -0,0 +1,34 @@
+  1. Add explicit reindex/backfill tooling.
+     Right now, rows are indexed only by PostTableData / PutTableData calls made from now on, so rows that already
+     exist never reach the index. There should be admin/dev commands such as:
+
+     ReindexProfile(profile_name)
+     ReindexTable(profile_name, table_name)
+     ReindexRow(profile_name, table_name, id)
+
+     This is the biggest missing piece.
+
+  2. Stop using relative ./tantivy_indexes.
+     Both writer and reader depend on the process working directory. Make it config/env-driven, e.g.
+     TANTIVY_INDEX_DIR.
+  3. Add index schema/version metadata.
+     If you change tokenizers/schema later, old indexes should fail with a clear “index version mismatch, reindex
+     required” instead of behaving strangely.
+  4. Batch index commits.
+     Current code opens a writer and commits per row. Fine for dev, not great for many inserts. A long-lived writer
+     task batching commits every N docs or every short interval would be more reliable and faster.
+  5. Make the indexing queue durable.
+     The current mpsc queue is in-memory. If the server crashes after DB insert but before indexing, search is stale.
+     For serious use, store pending index jobs in Postgres, process them, mark done.
+  6. Index only live rows intentionally.
+     handle_add_or_update currently fetches the row by id without checking deleted = false, and search then filters
+     deleted rows out at query time. I’d either skip indexing deleted rows or make delete/update semantics explicit.
+  7. Add typed fields for numbers/dates if you need range queries.
+     Right now numbers are converted to strings. Good for text search, bad for real numeric filtering/sorting. Tantivy
+     can do numeric/date fields, but JSON text fields are not enough for robust range search.
+  8. Decide column-name strategy.
+     Indexing lowercases raw DB JSON keys. If the UI uses display names/aliases, column constraints can fail to match
+     unless the frontend sends exactly what the index expects. I’d centralize the display-name-to-physical-name
+     mapping before search.
+  9. Add delete hooks for table/profile deletion.
+     When a table or profile is deleted, the matching Tantivy docs/index directory should be cleaned by code, not
+     manually.