working v12
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -7,3 +7,4 @@ steel_decimal/tests/property_tests.proptest-regressions
|
||||
canvas/*.toml
|
||||
.aider*
|
||||
.codex
|
||||
TODO.md
|
||||
|
||||
12
Cargo.lock
generated
12
Cargo.lock
generated
@@ -493,7 +493,7 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||
|
||||
[[package]]
|
||||
name = "canvas"
|
||||
version = "0.6.7"
|
||||
version = "0.6.11"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -586,7 +586,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "client"
|
||||
version = "0.6.7"
|
||||
version = "0.6.11"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -642,7 +642,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "common"
|
||||
version = "0.6.7"
|
||||
version = "0.6.11"
|
||||
dependencies = [
|
||||
"prost 0.13.5",
|
||||
"prost-build 0.14.1",
|
||||
@@ -3117,7 +3117,7 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
|
||||
|
||||
[[package]]
|
||||
name = "search"
|
||||
version = "0.6.7"
|
||||
version = "0.6.11"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"common",
|
||||
@@ -3216,7 +3216,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "server"
|
||||
version = "0.6.7"
|
||||
version = "0.6.11"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bcrypt",
|
||||
@@ -4549,7 +4549,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "validation-core"
|
||||
version = "0.6.7"
|
||||
version = "0.6.11"
|
||||
dependencies = [
|
||||
"regex",
|
||||
"serde",
|
||||
|
||||
@@ -5,7 +5,7 @@ resolver = "2"
|
||||
[workspace.package]
|
||||
# TODO: idk how to do the name, fix later
|
||||
# name = "komp_ac"
|
||||
version = "0.6.9"
|
||||
version = "0.6.12"
|
||||
edition = "2021"
|
||||
license = "GPL-3.0-or-later"
|
||||
authors = ["Filip Priečinský <filippriec@gmail.com>"]
|
||||
|
||||
2
client
2
client
Submodule client updated: e97a6cfeaa...4f8c71274a
@@ -1,8 +1,8 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use tantivy::schema::{
|
||||
Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, INDEXED, STORED,
|
||||
STRING,
|
||||
Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, TextOptions,
|
||||
INDEXED, STORED, STRING,
|
||||
};
|
||||
use tantivy::tokenizer::{
|
||||
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
|
||||
@@ -13,6 +13,7 @@ use tantivy::Index;
|
||||
pub const F_PG_ID: &str = "pg_id";
|
||||
pub const F_TABLE_NAME: &str = "table_name";
|
||||
pub const F_ROW_KEY: &str = "row_key";
|
||||
pub const F_ALL_TEXT: &str = "all_text";
|
||||
pub const F_DATA_WORD: &str = "data_word";
|
||||
pub const F_DATA_NGRAM: &str = "data_ngram";
|
||||
pub const F_DATA_EXACT: &str = "data_exact";
|
||||
@@ -59,6 +60,7 @@ pub fn create_search_schema() -> Schema {
|
||||
schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED);
|
||||
schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED);
|
||||
schema_builder.add_text_field(F_ROW_KEY, STRING | STORED);
|
||||
schema_builder.add_text_field(F_ALL_TEXT, text_options(TOK_WORD));
|
||||
|
||||
schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false));
|
||||
schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false));
|
||||
@@ -67,6 +69,14 @@ pub fn create_search_schema() -> Schema {
|
||||
schema_builder.build()
|
||||
}
|
||||
|
||||
fn text_options(tokenizer_name: &str) -> TextOptions {
|
||||
let indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer(tokenizer_name)
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
|
||||
TextOptions::default().set_indexing_options(indexing)
|
||||
}
|
||||
|
||||
fn json_options(tokenizer_name: &str, with_positions: bool, stored: bool) -> JsonObjectOptions {
|
||||
let index_option = if with_positions {
|
||||
IndexRecordOption::WithFreqsAndPositions
|
||||
@@ -153,6 +163,7 @@ pub struct SchemaFields {
|
||||
pub pg_id: Field,
|
||||
pub table_name: Field,
|
||||
pub row_key: Field,
|
||||
pub all_text: Field,
|
||||
pub data_word: Field,
|
||||
pub data_ngram: Field,
|
||||
pub data_exact: Field,
|
||||
@@ -164,6 +175,7 @@ impl SchemaFields {
|
||||
pg_id: get_field(schema, F_PG_ID)?,
|
||||
table_name: get_field(schema, F_TABLE_NAME)?,
|
||||
row_key: get_field(schema, F_ROW_KEY)?,
|
||||
all_text: get_field(schema, F_ALL_TEXT)?,
|
||||
data_word: get_field(schema, F_DATA_WORD)?,
|
||||
data_ngram: get_field(schema, F_DATA_NGRAM)?,
|
||||
data_exact: get_field(schema, F_DATA_EXACT)?,
|
||||
|
||||
@@ -112,6 +112,7 @@ impl SearcherService {
|
||||
|
||||
Ok(Response::new(SearchResponse { hits }))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
struct ProfileIndex {
|
||||
@@ -133,7 +134,7 @@ impl ProfileIndex {
|
||||
.map_err(|e| Status::internal(format!("Failed to build index reader: {}", e)))?;
|
||||
let fields = SchemaFields::from(&index.schema()).map_err(|e| {
|
||||
Status::internal(format!(
|
||||
"Search index schema mismatch. Reindex required: {}",
|
||||
"Search index schema mismatch. Delete the stale index and create it again: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
@@ -205,6 +206,22 @@ fn validate_identifier(value: &str, field_name: &str) -> Result<(), Status> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn validate_search_column(value: &str) -> Result<(), Status> {
|
||||
if value.is_empty() {
|
||||
return Err(Status::invalid_argument(
|
||||
"constraint.column must not be empty",
|
||||
));
|
||||
}
|
||||
|
||||
if value.chars().any(|ch| ch.is_control() || ch == '\0') {
|
||||
return Err(Status::invalid_argument(
|
||||
"constraint.column contains invalid characters",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns `"profile_name"."table_name"`, with each part wrapped in double
/// quotes.
///
/// Embedded double quotes are doubled (`"` becomes `""`) so a name can never
/// terminate its own quoted identifier early. For names without a double
/// quote the output is identical to plain quoting.
///
/// NOTE(review): the result looks like a schema-qualified SQL identifier;
/// confirm against callers that doubling quotes is the escaping they expect.
fn qualify_profile_table(profile_name: &str, table_name: &str) -> String {
    /// Wraps `ident` in double quotes, doubling any embedded double quote.
    fn quote_ident(ident: &str) -> String {
        format!("\"{}\"", ident.replace('"', "\"\""))
    }

    format!("{}.{}", quote_ident(profile_name), quote_ident(table_name))
}
|
||||
@@ -258,12 +275,7 @@ fn normalize_request(req: SearchRequest) -> Result<NormalizedSearchRequest, Stat
|
||||
|
||||
for constraint in req.must {
|
||||
let column = constraint.column.trim();
|
||||
if column.is_empty() {
|
||||
return Err(Status::invalid_argument(
|
||||
"constraint.column must not be empty",
|
||||
));
|
||||
}
|
||||
validate_identifier(column, "constraint.column")?;
|
||||
validate_search_column(column)?;
|
||||
|
||||
let query = constraint.query.trim();
|
||||
if query.is_empty() {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use common::search::{
|
||||
json_path_term, normalize_exact, tokenize_ngram, tokenize_word, SchemaFields,
|
||||
json_path_term, normalize_column_name, normalize_exact, tokenize_ngram, tokenize_word,
|
||||
SchemaFields,
|
||||
};
|
||||
use tantivy::query::{
|
||||
BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query, QueryParser,
|
||||
@@ -48,7 +49,7 @@ pub fn build_master_query(
|
||||
let free_words = tokenize_word(free_query);
|
||||
if !free_words.is_empty() {
|
||||
let predicate = fuzzy_predicate_unscoped(index, fields, &free_words)?;
|
||||
clauses.push((Occur::Should, predicate));
|
||||
clauses.push((Occur::Must, predicate));
|
||||
has_search_clause = true;
|
||||
}
|
||||
|
||||
@@ -79,7 +80,8 @@ fn exact_predicate(
|
||||
));
|
||||
}
|
||||
|
||||
let term = json_path_term(fields.data_exact, column, &normalized_value);
|
||||
let column = normalize_column_name(column);
|
||||
let term = json_path_term(fields.data_exact, &column, &normalized_value);
|
||||
Ok(Box::new(TermQuery::new(term, IndexRecordOption::Basic)))
|
||||
}
|
||||
|
||||
@@ -95,11 +97,13 @@ fn fuzzy_predicate_scoped(
|
||||
));
|
||||
}
|
||||
|
||||
let column = normalize_column_name(column);
|
||||
|
||||
let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
|
||||
|
||||
let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
|
||||
for word in &words {
|
||||
let term = json_path_term(fields.data_word, column, word);
|
||||
let term = json_path_term(fields.data_word, &column, word);
|
||||
let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new();
|
||||
|
||||
alternates.push((
|
||||
@@ -136,7 +140,7 @@ fn fuzzy_predicate_scoped(
|
||||
let phrase_terms: Vec<(usize, Term)> = words
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(offset, word)| (offset, json_path_term(fields.data_word, column, word)))
|
||||
.map(|(offset, word)| (offset, json_path_term(fields.data_word, &column, word)))
|
||||
.collect();
|
||||
let phrase = PhraseQuery::new_with_offset_and_slop(phrase_terms, 3);
|
||||
layers.push((
|
||||
@@ -150,7 +154,7 @@ fn fuzzy_predicate_scoped(
|
||||
let ngram_clauses: Vec<(Occur, Box<dyn Query>)> = ngrams
|
||||
.into_iter()
|
||||
.map(|gram| {
|
||||
let term = json_path_term(fields.data_ngram, column, &gram);
|
||||
let term = json_path_term(fields.data_ngram, &column, &gram);
|
||||
(
|
||||
Occur::Must,
|
||||
Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query>,
|
||||
@@ -176,35 +180,43 @@ fn fuzzy_predicate_unscoped(
|
||||
) -> Result<Box<dyn Query>, Status> {
|
||||
let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(index, vec![fields.data_word]);
|
||||
let query_string = words
|
||||
.iter()
|
||||
.map(|word| format!("+{}*", word))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
if let Ok(query) = parser.parse_query(&query_string) {
|
||||
layers.push((Occur::Should, Box::new(BoostQuery::new(query, 4.0))));
|
||||
}
|
||||
}
|
||||
let mut per_word_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
|
||||
for word in words {
|
||||
let term = Term::from_field_text(fields.all_text, word);
|
||||
let mut alternates: Vec<(Occur, Box<dyn Query>)> = Vec::new();
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(index, vec![fields.data_word]);
|
||||
let query_string = words
|
||||
.iter()
|
||||
.map(|word| match fuzzy_distance(word.chars().count()) {
|
||||
Some(distance) => format!("+{}~{}", word, distance),
|
||||
None => format!("+{}", word),
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
if let Ok(query) = parser.parse_query(&query_string) {
|
||||
layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0))));
|
||||
alternates.push((
|
||||
Occur::Should,
|
||||
Box::new(BoostQuery::new(
|
||||
Box::new(TermQuery::new(term.clone(), IndexRecordOption::WithFreqs)),
|
||||
4.0,
|
||||
)),
|
||||
));
|
||||
|
||||
alternates.push((
|
||||
Occur::Should,
|
||||
Box::new(BoostQuery::new(
|
||||
Box::new(FuzzyTermQuery::new_prefix(term.clone(), 0, false)),
|
||||
3.0,
|
||||
)),
|
||||
));
|
||||
|
||||
if let Some(distance) = fuzzy_distance(word.chars().count()) {
|
||||
alternates.push((
|
||||
Occur::Should,
|
||||
Box::new(BoostQuery::new(
|
||||
Box::new(FuzzyTermQuery::new(term, distance, true)),
|
||||
2.0,
|
||||
)),
|
||||
));
|
||||
}
|
||||
|
||||
per_word_clauses.push((Occur::Must, Box::new(BooleanQuery::new(alternates))));
|
||||
}
|
||||
layers.push((Occur::Should, Box::new(BooleanQuery::new(per_word_clauses))));
|
||||
|
||||
if words.len() > 1 {
|
||||
let parser = QueryParser::for_index(index, vec![fields.data_word]);
|
||||
let parser = QueryParser::for_index(index, vec![fields.all_text]);
|
||||
let query_string = format!("\"{}\"~3", words.join(" "));
|
||||
if let Ok(query) = parser.parse_query(&query_string) {
|
||||
layers.push((Occur::Should, Box::new(BoostQuery::new(query, 2.0))));
|
||||
@@ -212,10 +224,10 @@ fn fuzzy_predicate_unscoped(
|
||||
}
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(index, vec![fields.data_ngram]);
|
||||
let parser = QueryParser::for_index(index, vec![fields.all_text]);
|
||||
let query_string = words
|
||||
.iter()
|
||||
.map(|word| format!("+{}", word))
|
||||
.map(|word| format!("+{}*", word))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
if let Ok(query) = parser.parse_query(&query_string) {
|
||||
|
||||
2
server
2
server
Submodule server updated: 527a053ab9...2f933e4e34
34
tantivy_todo.md
Normal file
34
tantivy_todo.md
Normal file
@@ -0,0 +1,34 @@
|
||||
1. Add explicit reindex/backfill tooling.
|
||||
Right now, rows are indexed only when a future PostTableData / PutTableData call touches them, so rows that already exist never reach the index. There should be an admin/dev command like:
|
||||
|
||||
ReindexProfile(profile_name)
|
||||
ReindexTable(profile_name, table_name)
|
||||
ReindexRow(profile_name, table_name, id)
|
||||
|
||||
This is the biggest missing piece.
|
||||
|
||||
2. Stop using relative ./tantivy_indexes.
|
||||
Both writer and reader depend on the process working directory. Make it config/env-driven, e.g.
|
||||
TANTIVY_INDEX_DIR.
|
||||
3. Add index schema/version metadata.
|
||||
If you change tokenizers/schema later, old indexes should fail with a clear “index version mismatch, reindex
|
||||
required” instead of behaving strangely.
|
||||
4. Batch index commits.
|
||||
Current code opens a writer and commits per row. Fine for dev, not great for many inserts. A long-lived writer
|
||||
task batching commits every N docs or every short interval would be more reliable and faster.
|
||||
5. Make the indexing queue durable.
|
||||
The current mpsc queue is in-memory. If the server crashes after DB insert but before indexing, search is stale.
|
||||
For serious use, store pending index jobs in Postgres, process them, mark done.
|
||||
6. Index only live rows intentionally.
|
||||
handle_add_or_update currently fetches the row by id without checking deleted = false, then search filters deleted
|
||||
rows later. I’d either skip indexing deleted rows or make delete/update semantics explicit.
|
||||
7. Add typed fields for numbers/dates if you need range queries.
|
||||
Right now numbers are converted to strings. Good for text search, bad for real numeric filtering/sorting. Tantivy
|
||||
can do numeric/date fields, but JSON text fields are not enough for robust range search.
|
||||
8. Decide column-name strategy.
|
||||
Indexing lowercases raw DB JSON keys. If UI uses display names/aliases, column constraints can miss unless the
|
||||
frontend sends exactly what the index expects. I’d centralize display-name to physical-name mapping before
|
||||
search.
|
||||
9. Add delete hooks for table/profile deletion.
|
||||
When a table or profile is deleted, the matching Tantivy docs/index directory should be cleaned by code, not
|
||||
manually.
|
||||
Reference in New Issue
Block a user