working v12

This commit is contained in:
Priec
2026-05-17 13:10:44 +02:00
parent 6a87750329
commit dc273506b7
9 changed files with 121 additions and 50 deletions

1
.gitignore vendored
View File

@@ -7,3 +7,4 @@ steel_decimal/tests/property_tests.proptest-regressions
canvas/*.toml canvas/*.toml
.aider* .aider*
.codex .codex
TODO.md

12
Cargo.lock generated
View File

@@ -493,7 +493,7 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
[[package]] [[package]]
name = "canvas" name = "canvas"
version = "0.6.7" version = "0.6.11"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
@@ -586,7 +586,7 @@ dependencies = [
[[package]] [[package]]
name = "client" name = "client"
version = "0.6.7" version = "0.6.11"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
@@ -642,7 +642,7 @@ dependencies = [
[[package]] [[package]]
name = "common" name = "common"
version = "0.6.7" version = "0.6.11"
dependencies = [ dependencies = [
"prost 0.13.5", "prost 0.13.5",
"prost-build 0.14.1", "prost-build 0.14.1",
@@ -3117,7 +3117,7 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]] [[package]]
name = "search" name = "search"
version = "0.6.7" version = "0.6.11"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"common", "common",
@@ -3216,7 +3216,7 @@ dependencies = [
[[package]] [[package]]
name = "server" name = "server"
version = "0.6.7" version = "0.6.11"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bcrypt", "bcrypt",
@@ -4549,7 +4549,7 @@ dependencies = [
[[package]] [[package]]
name = "validation-core" name = "validation-core"
version = "0.6.7" version = "0.6.11"
dependencies = [ dependencies = [
"regex", "regex",
"serde", "serde",

View File

@@ -5,7 +5,7 @@ resolver = "2"
[workspace.package] [workspace.package]
# TODO: idk how to do the name, fix later # TODO: idk how to do the name, fix later
# name = "komp_ac" # name = "komp_ac"
version = "0.6.9" version = "0.6.12"
edition = "2021" edition = "2021"
license = "GPL-3.0-or-later" license = "GPL-3.0-or-later"
authors = ["Filip Priečinský <filippriec@gmail.com>"] authors = ["Filip Priečinský <filippriec@gmail.com>"]

2
client

Submodule client updated: e97a6cfeaa...4f8c71274a

View File

@@ -1,8 +1,8 @@
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use tantivy::schema::{ use tantivy::schema::{
Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, INDEXED, STORED, Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, TextOptions,
STRING, INDEXED, STORED, STRING,
}; };
use tantivy::tokenizer::{ use tantivy::tokenizer::{
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
@@ -13,6 +13,7 @@ use tantivy::Index;
pub const F_PG_ID: &str = "pg_id"; pub const F_PG_ID: &str = "pg_id";
pub const F_TABLE_NAME: &str = "table_name"; pub const F_TABLE_NAME: &str = "table_name";
pub const F_ROW_KEY: &str = "row_key"; pub const F_ROW_KEY: &str = "row_key";
pub const F_ALL_TEXT: &str = "all_text";
pub const F_DATA_WORD: &str = "data_word"; pub const F_DATA_WORD: &str = "data_word";
pub const F_DATA_NGRAM: &str = "data_ngram"; pub const F_DATA_NGRAM: &str = "data_ngram";
pub const F_DATA_EXACT: &str = "data_exact"; pub const F_DATA_EXACT: &str = "data_exact";
@@ -59,6 +60,7 @@ pub fn create_search_schema() -> Schema {
schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED); schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED);
schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED); schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED);
schema_builder.add_text_field(F_ROW_KEY, STRING | STORED); schema_builder.add_text_field(F_ROW_KEY, STRING | STORED);
schema_builder.add_text_field(F_ALL_TEXT, text_options(TOK_WORD));
schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false)); schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false));
schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false)); schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false));
@@ -67,6 +69,14 @@ pub fn create_search_schema() -> Schema {
schema_builder.build() schema_builder.build()
} }
/// Builds the [`TextOptions`] for a plain text field indexed with the tokenizer
/// registered under `tokenizer_name`.
///
/// The index record option is `WithFreqsAndPositions`. Every other setting is
/// left at whatever `TextOptions::default()` provides.
fn text_options(tokenizer_name: &str) -> TextOptions {
    TextOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer(tokenizer_name)
            .set_index_option(IndexRecordOption::WithFreqsAndPositions),
    )
}
fn json_options(tokenizer_name: &str, with_positions: bool, stored: bool) -> JsonObjectOptions { fn json_options(tokenizer_name: &str, with_positions: bool, stored: bool) -> JsonObjectOptions {
let index_option = if with_positions { let index_option = if with_positions {
IndexRecordOption::WithFreqsAndPositions IndexRecordOption::WithFreqsAndPositions
@@ -153,6 +163,7 @@ pub struct SchemaFields {
pub pg_id: Field, pub pg_id: Field,
pub table_name: Field, pub table_name: Field,
pub row_key: Field, pub row_key: Field,
pub all_text: Field,
pub data_word: Field, pub data_word: Field,
pub data_ngram: Field, pub data_ngram: Field,
pub data_exact: Field, pub data_exact: Field,
@@ -164,6 +175,7 @@ impl SchemaFields {
pg_id: get_field(schema, F_PG_ID)?, pg_id: get_field(schema, F_PG_ID)?,
table_name: get_field(schema, F_TABLE_NAME)?, table_name: get_field(schema, F_TABLE_NAME)?,
row_key: get_field(schema, F_ROW_KEY)?, row_key: get_field(schema, F_ROW_KEY)?,
all_text: get_field(schema, F_ALL_TEXT)?,
data_word: get_field(schema, F_DATA_WORD)?, data_word: get_field(schema, F_DATA_WORD)?,
data_ngram: get_field(schema, F_DATA_NGRAM)?, data_ngram: get_field(schema, F_DATA_NGRAM)?,
data_exact: get_field(schema, F_DATA_EXACT)?, data_exact: get_field(schema, F_DATA_EXACT)?,

View File

@@ -112,6 +112,7 @@ impl SearcherService {
Ok(Response::new(SearchResponse { hits })) Ok(Response::new(SearchResponse { hits }))
} }
} }
struct ProfileIndex { struct ProfileIndex {
@@ -133,7 +134,7 @@ impl ProfileIndex {
.map_err(|e| Status::internal(format!("Failed to build index reader: {}", e)))?; .map_err(|e| Status::internal(format!("Failed to build index reader: {}", e)))?;
let fields = SchemaFields::from(&index.schema()).map_err(|e| { let fields = SchemaFields::from(&index.schema()).map_err(|e| {
Status::internal(format!( Status::internal(format!(
"Search index schema mismatch. Reindex required: {}", "Search index schema mismatch. Delete the stale index and create it again: {}",
e e
)) ))
})?; })?;
@@ -205,6 +206,22 @@ fn validate_identifier(value: &str, field_name: &str) -> Result<(), Status> {
Ok(()) Ok(())
} }
/// Validates the column name carried by a search constraint.
///
/// # Errors
///
/// Returns `Status::invalid_argument` when `value` is empty, or when it
/// contains any control character (NUL included).
fn validate_search_column(value: &str) -> Result<(), Status> {
    // `char::is_control` already matches U+0000, so NUL needs no separate test.
    let rejection = if value.is_empty() {
        Some("constraint.column must not be empty")
    } else if value.chars().any(char::is_control) {
        Some("constraint.column contains invalid characters")
    } else {
        None
    };

    match rejection {
        Some(message) => Err(Status::invalid_argument(message)),
        None => Ok(()),
    }
}
fn qualify_profile_table(profile_name: &str, table_name: &str) -> String { fn qualify_profile_table(profile_name: &str, table_name: &str) -> String {
format!("\"{}\".\"{}\"", profile_name, table_name) format!("\"{}\".\"{}\"", profile_name, table_name)
} }
@@ -258,12 +275,7 @@ fn normalize_request(req: SearchRequest) -> Result<NormalizedSearchRequest, Stat
for constraint in req.must { for constraint in req.must {
let column = constraint.column.trim(); let column = constraint.column.trim();
if column.is_empty() { validate_search_column(column)?;
return Err(Status::invalid_argument(
"constraint.column must not be empty",
));
}
validate_identifier(column, "constraint.column")?;
let query = constraint.query.trim(); let query = constraint.query.trim();
if query.is_empty() { if query.is_empty() {

View File

@@ -1,5 +1,6 @@
use common::search::{ use common::search::{
json_path_term, normalize_exact, tokenize_ngram, tokenize_word, SchemaFields, json_path_term, normalize_column_name, normalize_exact, tokenize_ngram, tokenize_word,
SchemaFields,
}; };
use tantivy::query::{ use tantivy::query::{
BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query, QueryParser, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query, QueryParser,
@@ -48,7 +49,7 @@ pub fn build_master_query(
let free_words = tokenize_word(free_query); let free_words = tokenize_word(free_query);
if !free_words.is_empty() { if !free_words.is_empty() {
let predicate = fuzzy_predicate_unscoped(index, fields, &free_words)?; let predicate = fuzzy_predicate_unscoped(index, fields, &free_words)?;
clauses.push((Occur::Should, predicate)); clauses.push((Occur::Must, predicate));
has_search_clause = true; has_search_clause = true;
} }
@@ -79,7 +80,8 @@ fn exact_predicate(
)); ));
} }
let term = json_path_term(fields.data_exact, column, &normalized_value); let column = normalize_column_name(column);
let term = json_path_term(fields.data_exact, &column, &normalized_value);
Ok(Box::new(TermQuery::new(term, IndexRecordOption::Basic))) Ok(Box::new(TermQuery::new(term, IndexRecordOption::Basic)))
} }
@@ -95,11 +97,13 @@ fn fuzzy_predicate_scoped(
)); ));
} }
let column = normalize_column_name(column);
let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new(); let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new(); let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
for word in &words { for word in &words {
let term = json_path_term(fields.data_word, column, word); let term = json_path_term(fields.data_word, &column, word);
let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new(); let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new();
alternates.push(( alternates.push((
@@ -136,7 +140,7 @@ fn fuzzy_predicate_scoped(
let phrase_terms: Vec<(usize, Term)> = words let phrase_terms: Vec<(usize, Term)> = words
.iter() .iter()
.enumerate() .enumerate()
.map(|(offset, word)| (offset, json_path_term(fields.data_word, column, word))) .map(|(offset, word)| (offset, json_path_term(fields.data_word, &column, word)))
.collect(); .collect();
let phrase = PhraseQuery::new_with_offset_and_slop(phrase_terms, 3); let phrase = PhraseQuery::new_with_offset_and_slop(phrase_terms, 3);
layers.push(( layers.push((
@@ -150,7 +154,7 @@ fn fuzzy_predicate_scoped(
let ngram_clauses: Vec<(Occur, Box<dyn Query>)> = ngrams let ngram_clauses: Vec<(Occur, Box<dyn Query>)> = ngrams
.into_iter() .into_iter()
.map(|gram| { .map(|gram| {
let term = json_path_term(fields.data_ngram, column, &gram); let term = json_path_term(fields.data_ngram, &column, &gram);
( (
Occur::Must, Occur::Must,
Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query>, Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query>,
@@ -176,35 +180,43 @@ fn fuzzy_predicate_unscoped(
) -> Result<Box<dyn Query>, Status> { ) -> Result<Box<dyn Query>, Status> {
let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new(); let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
{ let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
let parser = QueryParser::for_index(index, vec![fields.data_word]); for word in words {
let query_string = words let term = Term::from_field_text(fields.all_text, word);
.iter() let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new();
.map(|word| format!("+{}*", word))
.collect::<Vec<_>>()
.join(" ");
if let Ok(query) = parser.parse_query(&query_string) {
layers.push((Occur::Should, Box::new(BoostQuery::new(query, 4.0))));
}
}
{ alternates.push((
let parser = QueryParser::for_index(index, vec![fields.data_word]); Occur::Should,
let query_string = words Box::new(BoostQuery::new(
.iter() Box::new(TermQuery::new(term.clone(), IndexRecordOption::WithFreqs)),
.map(|word| match fuzzy_distance(word.chars().count()) { 4.0,
Some(distance) => format!("+{}~{}", word, distance), )),
None => format!("+{}", word), ));
})
.collect::<Vec<_>>() alternates.push((
.join(" "); Occur::Should,
if let Ok(query) = parser.parse_query(&query_string) { Box::new(BoostQuery::new(
layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0)))); Box::new(FuzzyTermQuery::new_prefix(term.clone(), 0, false)),
3.0,
)),
));
if let Some(distance) = fuzzy_distance(word.chars().count()) {
alternates.push((
Occur::Should,
Box::new(BoostQuery::new(
Box::new(FuzzyTermQuery::new(term, distance, true)),
2.0,
)),
));
} }
per_word_clauses.push((Occur::Must, Box::new(BooleanQuery::new(alternates))));
} }
layers.push((Occur::Should, Box::new(BooleanQuery::new(per_word_clauses))));
if words.len() > 1 { if words.len() > 1 {
let parser = QueryParser::for_index(index, vec![fields.data_word]); let parser = QueryParser::for_index(index, vec![fields.all_text]);
let query_string = format!("\"{}\"~3", words.join(" ")); let query_string = format!("\"{}\"~3", words.join(" "));
if let Ok(query) = parser.parse_query(&query_string) { if let Ok(query) = parser.parse_query(&query_string) {
layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0)))); layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0))));
@@ -212,10 +224,10 @@ fn fuzzy_predicate_unscoped(
} }
{ {
let parser = QueryParser::for_index(index, vec![fields.data_ngram]); let parser = QueryParser::for_index(index, vec![fields.all_text]);
let query_string = words let query_string = words
.iter() .iter()
.map(|word| format!("+{}", word)) .map(|word| format!("+{}*", word))
.collect::<Vec<_>>() .collect::<Vec<_>>()
.join(" "); .join(" ");
if let Ok(query) = parser.parse_query(&query_string) { if let Ok(query) = parser.parse_query(&query_string) {

2
server

Submodule server updated: 527a053ab9...2f933e4e34

34
tantivy_todo.md Normal file
View File

@@ -0,0 +1,34 @@
1. Add explicit reindex/backfill tooling.
Right now, only future PostTableData / PutTableData calls index rows. There should be an admin/dev command like:
ReindexProfile(profile_name)
ReindexTable(profile_name, table_name)
ReindexRow(profile_name, table_name, id)
This is the biggest missing piece.
2. Stop using relative ./tantivy_indexes.
Both writer and reader depend on the process working directory. Make it config/env-driven, e.g.
TANTIVY_INDEX_DIR.
3. Add index schema/version metadata.
If you change tokenizers/schema later, old indexes should fail with a clear “index version mismatch, reindex
required” instead of behaving strangely.
4. Batch index commits.
Current code opens a writer and commits per row. Fine for dev, not great for many inserts. A long-lived writer
task batching commits every N docs or every short interval would be more reliable and faster.
5. Make the indexing queue durable.
The current mpsc queue is in-memory. If the server crashes after DB insert but before indexing, search is stale.
For serious use, store pending index jobs in Postgres, process them, mark done.
6. Index only live rows intentionally.
handle_add_or_update currently fetches row by id without checking deleted = false, then search filters deleted
rows later. I'd either skip indexing deleted rows or make delete/update semantics explicit.
7. Add typed fields for numbers/dates if you need range queries.
Right now numbers are converted to strings. Good for text search, bad for real numeric filtering/sorting. Tantivy
can do numeric/date fields, but JSON text fields are not enough for robust range search.
8. Decide column-name strategy.
Indexing lowercases raw DB JSON keys. If UI uses display names/aliases, column constraints can miss unless the
frontend sends exactly what the index expects. I'd centralize display-name to physical-name mapping before
search.
9. Add delete hooks for table/profile deletion.
When a table or profile is deleted, the matching Tantivy docs/index directory should be cleaned by code, not
manually.