diff --git a/.gitignore b/.gitignore index 6ad69b4..1ba8016 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ steel_decimal/tests/property_tests.proptest-regressions canvas/*.toml .aider* .codex +TODO.md diff --git a/Cargo.lock b/Cargo.lock index 5ecdb0b..551131d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -493,7 +493,7 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" [[package]] name = "canvas" -version = "0.6.7" +version = "0.6.11" dependencies = [ "anyhow", "async-trait", @@ -586,7 +586,7 @@ dependencies = [ [[package]] name = "client" -version = "0.6.7" +version = "0.6.11" dependencies = [ "anyhow", "async-trait", @@ -642,7 +642,7 @@ dependencies = [ [[package]] name = "common" -version = "0.6.7" +version = "0.6.11" dependencies = [ "prost 0.13.5", "prost-build 0.14.1", @@ -3117,7 +3117,7 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "search" -version = "0.6.7" +version = "0.6.11" dependencies = [ "anyhow", "common", @@ -3216,7 +3216,7 @@ dependencies = [ [[package]] name = "server" -version = "0.6.7" +version = "0.6.11" dependencies = [ "anyhow", "bcrypt", @@ -4549,7 +4549,7 @@ dependencies = [ [[package]] name = "validation-core" -version = "0.6.7" +version = "0.6.11" dependencies = [ "regex", "serde", diff --git a/Cargo.toml b/Cargo.toml index 061de84..57f006a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ resolver = "2" [workspace.package] # TODO: idk how to do the name, fix later # name = "komp_ac" -version = "0.6.9" +version = "0.6.12" edition = "2021" license = "GPL-3.0-or-later" authors = ["Filip Priečinský "] diff --git a/client b/client index e97a6cf..4f8c712 160000 --- a/client +++ b/client @@ -1 +1 @@ -Subproject commit e97a6cfeaad062e80e0d7956adbf492c57fdab34 +Subproject commit 4f8c71274ac36a32cf61715f675da0e3acf80d3b diff --git a/common/src/search.rs b/common/src/search.rs index 98c40f9..b0244a4 100644 --- a/common/src/search.rs +++ b/common/src/search.rs @@ -1,8 +1,8 @@ use std::path::{Path, PathBuf}; use tantivy::schema::{ - Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, INDEXED, STORED, - STRING, + Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, TextOptions, + INDEXED, STORED, STRING, }; use tantivy::tokenizer::{ AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, @@ -13,6 +13,7 @@ use tantivy::Index; pub const F_PG_ID: &str = "pg_id"; pub const F_TABLE_NAME: &str = "table_name"; pub const F_ROW_KEY: &str = "row_key"; +pub const F_ALL_TEXT: &str = "all_text"; pub const F_DATA_WORD: &str = "data_word"; pub const F_DATA_NGRAM: &str = "data_ngram"; pub const F_DATA_EXACT: &str = "data_exact"; @@ -59,6 +60,7 @@ pub fn create_search_schema() -> Schema { schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED); schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED); schema_builder.add_text_field(F_ROW_KEY, STRING | STORED); + schema_builder.add_text_field(F_ALL_TEXT, text_options(TOK_WORD)); schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false)); schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false)); @@ -67,6 +69,14 @@ pub fn create_search_schema() -> Schema { schema_builder.build() } +fn text_options(tokenizer_name: &str) -> TextOptions { + let indexing = TextFieldIndexing::default() + .set_tokenizer(tokenizer_name) + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + + TextOptions::default().set_indexing_options(indexing) +} + fn json_options(tokenizer_name: &str, with_positions: bool, stored: bool) -> JsonObjectOptions { let index_option = if with_positions { IndexRecordOption::WithFreqsAndPositions @@ -153,6 +163,7 @@ pub struct SchemaFields { pub pg_id: Field, pub table_name: Field, pub row_key: Field, + pub all_text: Field, pub data_word: Field, pub data_ngram: Field, pub data_exact: Field, @@ -164,6 +175,7 @@ impl SchemaFields { pg_id: get_field(schema, F_PG_ID)?, table_name: get_field(schema, F_TABLE_NAME)?, row_key: get_field(schema, F_ROW_KEY)?, + all_text: get_field(schema, F_ALL_TEXT)?, data_word: get_field(schema, F_DATA_WORD)?, data_ngram: get_field(schema, F_DATA_NGRAM)?, data_exact: get_field(schema, F_DATA_EXACT)?, diff --git a/search/src/lib.rs b/search/src/lib.rs index 8034594..50f9ca4 100644 --- a/search/src/lib.rs +++ b/search/src/lib.rs @@ -112,6 +112,7 @@ impl SearcherService { Ok(Response::new(SearchResponse { hits })) } + } struct ProfileIndex { @@ -133,7 +134,7 @@ impl ProfileIndex { .map_err(|e| Status::internal(format!("Failed to build index reader: {}", e)))?; let fields = SchemaFields::from(&index.schema()).map_err(|e| { Status::internal(format!( - "Search index schema mismatch. Reindex required: {}", + "Search index schema mismatch. Delete the stale index and create it again: {}", e )) })?; @@ -205,6 +206,22 @@ fn validate_identifier(value: &str, field_name: &str) -> Result<(), Status> { Ok(()) } +fn validate_search_column(value: &str) -> Result<(), Status> { + if value.is_empty() { + return Err(Status::invalid_argument( + "constraint.column must not be empty", + )); + } + + if value.chars().any(|ch| ch.is_control() || ch == '\0') { + return Err(Status::invalid_argument( + "constraint.column contains invalid characters", + )); + } + + Ok(()) +} + fn qualify_profile_table(profile_name: &str, table_name: &str) -> String { format!("\"{}\".\"{}\"", profile_name, table_name) } @@ -258,12 +275,7 @@ fn normalize_request(req: SearchRequest) -> Result)> = Vec::new(); let mut per_word_clauses: Vec<(Occur, Box)> = Vec::new(); for word in &words { - let term = json_path_term(fields.data_word, column, word); + let term = json_path_term(fields.data_word, &column, word); let mut alternates: Vec<(Occur, Box)> = Vec::new(); alternates.push(( @@ -136,7 +140,7 @@ fn fuzzy_predicate_scoped( let phrase_terms: Vec<(usize, Term)> = words .iter() .enumerate() - .map(|(offset, word)| (offset, json_path_term(fields.data_word, column, word))) + .map(|(offset, word)| (offset, json_path_term(fields.data_word, &column, word))) .collect(); let phrase = PhraseQuery::new_with_offset_and_slop(phrase_terms, 3); layers.push(( @@ -150,7 +154,7 @@ fn fuzzy_predicate_scoped( let ngram_clauses: Vec<(Occur, Box)> = ngrams .into_iter() .map(|gram| { - let term = json_path_term(fields.data_ngram, column, &gram); + let term = json_path_term(fields.data_ngram, &column, &gram); ( Occur::Must, Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box, @@ -176,35 +180,43 @@ fn fuzzy_predicate_unscoped( ) -> Result, Status> { let mut layers: Vec<(Occur, Box)> = Vec::new(); - { - let parser = QueryParser::for_index(index, vec![fields.data_word]); - let query_string = words - .iter() - .map(|word| format!("+{}*", word)) - .collect::>() - .join(" "); - if let Ok(query) = parser.parse_query(&query_string) { - layers.push((Occur::Should, Box::new(BoostQuery::new(query, 4.0)))); - } - } + let mut per_word_clauses: Vec<(Occur, Box)> = Vec::new(); + for word in words { + let term = Term::from_field_text(fields.all_text, word); + let mut alternates: Vec<(Occur, Box)> = Vec::new(); - { - let parser = QueryParser::for_index(index, vec![fields.data_word]); - let query_string = words - .iter() - .map(|word| match fuzzy_distance(word.chars().count()) { - Some(distance) => format!("+{}~{}", word, distance), - None => format!("+{}", word), - }) - .collect::>() - .join(" "); - if let Ok(query) = parser.parse_query(&query_string) { - layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0)))); + alternates.push(( + Occur::Should, + Box::new(BoostQuery::new( + Box::new(TermQuery::new(term.clone(), IndexRecordOption::WithFreqs)), + 4.0, + )), + )); + + alternates.push(( + Occur::Should, + Box::new(BoostQuery::new( + Box::new(FuzzyTermQuery::new_prefix(term.clone(), 0, false)), + 3.0, + )), + )); + + if let Some(distance) = fuzzy_distance(word.chars().count()) { + alternates.push(( + Occur::Should, + Box::new(BoostQuery::new( + Box::new(FuzzyTermQuery::new(term, distance, true)), + 2.0, + )), + )); } + + per_word_clauses.push((Occur::Must, Box::new(BooleanQuery::new(alternates)))); } + layers.push((Occur::Should, Box::new(BooleanQuery::new(per_word_clauses)))); if words.len() > 1 { - let parser = QueryParser::for_index(index, vec![fields.data_word]); + let parser = QueryParser::for_index(index, vec![fields.all_text]); let query_string = format!("\"{}\"~3", words.join(" ")); if let Ok(query) = parser.parse_query(&query_string) { layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0)))); @@ -212,10 +224,10 @@ fn fuzzy_predicate_unscoped( } { - let parser = QueryParser::for_index(index, vec![fields.data_ngram]); + let parser = QueryParser::for_index(index, vec![fields.all_text]); let query_string = words .iter() - .map(|word| format!("+{}", word)) + .map(|word| format!("+{}*", word)) .collect::>() .join(" "); if let Ok(query) = parser.parse_query(&query_string) { diff --git a/server b/server index 527a053..2f933e4 160000 --- a/server +++ b/server @@ -1 +1 @@ -Subproject commit 527a053ab908b1911194db57b19e34917fd735e2 +Subproject commit 2f933e4e34e1417f2eeb1024f98a95e1c598b06f diff --git a/tantivy_todo.md b/tantivy_todo.md new file mode 100644 index 0000000..8a7fcb8 --- /dev/null +++ b/tantivy_todo.md @@ -0,0 +1,34 @@ + 1. Add explicit reindex/backfill tooling. + Right now, only future PostTableData / PutTableData calls index rows. There should be an admin/dev command like: + + ReindexProfile(profile_name) + ReindexTable(profile_name, table_name) + ReindexRow(profile_name, table_name, id) + + This is the biggest missing piece. + + 2. Stop using relative ./tantivy_indexes. + Both writer and reader depend on the process working directory. Make it config/env-driven, e.g. + TANTIVY_INDEX_DIR. + 3. Add index schema/version metadata. + If you change tokenizers/schema later, old indexes should fail with a clear “index version mismatch, reindex + required” instead of behaving strangely. + 4. Batch index commits. + Current code opens a writer and commits per row. Fine for dev, not great for many inserts. A long-lived writer + task batching commits every N docs or every short interval would be more reliable and faster. + 5. Make the indexing queue durable. + The current mpsc queue is in-memory. If the server crashes after DB insert but before indexing, search is stale. + For serious use, store pending index jobs in Postgres, process them, mark done. + 6. Index only live rows intentionally. + handle_add_or_update currently fetches row by id without checking deleted = false, then search filters deleted + rows later. I’d either skip indexing deleted rows or make delete/update semantics explicit. + 7. Add typed fields for numbers/dates if you need range queries. + Right now numbers are converted to strings. Good for text search, bad for real numeric filtering/sorting. Tantivy + can do numeric/date fields, but JSON text fields are not enough for robust range search. + 8. Decide column-name strategy. + Indexing lowercases raw DB JSON keys. If UI uses display names/aliases, column constraints can miss unless the + frontend sends exactly what the index expects. I’d centralize display-name to physical-name mapping before + search. + 9. Add delete hooks for table/profile deletion. + When a table or profile is deleted, the matching Tantivy docs/index directory should be cleaned by code, not + manually.