Better search, but it still has some flaws. It at least works, even though it's not perfect. Needs more testing, but I'm pretty happy with it right now, so I'm keeping it this way.
@@ -1,18 +1,20 @@
-// src/lib.rs
+// search/src/lib.rs
 
 use std::path::Path;
 use tantivy::collector::TopDocs;
-use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
+use tantivy::query::{
+    BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser,
+    TermQuery,
+};
 use tantivy::schema::IndexRecordOption;
-use tantivy::tokenizer::Tokenizer;
 use tantivy::{Index, TantivyDocument, Term};
 use tonic::{Request, Response, Status};
 
 use common::proto::multieko2::search::{
     search_response::Hit, SearchRequest, SearchResponse,
 };
-use common::proto::multieko2::search::searcher_server::Searcher;
 pub use common::proto::multieko2::search::searcher_server::SearcherServer;
+use common::proto::multieko2::search::searcher_server::Searcher;
 use tantivy::schema::Value;
 
 pub struct SearcherService;
@@ -79,8 +81,8 @@ impl Searcher for SearcherService {
         let index = Index::open_in_dir(&index_path)
             .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
 
-        register_slovak_tokenizer(&index).map_err(|e| {
-            Status::internal(format!("Failed to register Slovak tokenizer: {}", e))
+        register_slovak_tokenizers(&index).map_err(|e| {
+            Status::internal(format!("Failed to register Slovak tokenizers: {}", e))
         })?;
 
         let reader = index.reader().map_err(|e| {
@@ -89,49 +91,109 @@ impl Searcher for SearcherService {
         let searcher = reader.searcher();
         let schema = index.schema();
 
-        let text_sk_field = schema.get_field("text_sk").map_err(|_| {
-            Status::internal("Schema is missing the 'text_sk' field.")
+        let prefix_edge_field = schema.get_field("prefix_edge").map_err(|_| {
+            Status::internal("Schema is missing the 'prefix_edge' field.")
+        })?;
+        let prefix_full_field = schema.get_field("prefix_full").map_err(|_| {
+            Status::internal("Schema is missing the 'prefix_full' field.")
+        })?;
+        let text_ngram_field = schema.get_field("text_ngram").map_err(|_| {
+            Status::internal("Schema is missing the 'text_ngram' field.")
         })?;
         let pg_id_field = schema.get_field("pg_id").map_err(|_| {
             Status::internal("Schema is missing the 'pg_id' field.")
         })?;
 
-        // --- FINAL, ROBUST QUERY LOGIC ---
-
-        // 1. Get the exact tokenizer used for indexing the target field.
-        let mut tokenizer = index
-            .tokenizer_for_field(text_sk_field)
-            .map_err(|e| Status::internal(format!("Tokenizer not found: {}", e)))?;
-
-        // 2. Manually tokenize the user's normalized query string.
-        // CORRECTED: Store the normalized string in a variable to extend its lifetime.
         let normalized_query = normalize_slovak_text(&query_str);
-        let mut token_stream = tokenizer.token_stream(&normalized_query);
+        let words: Vec<&str> = normalized_query.split_whitespace().collect();
 
-        let mut terms = Vec::new();
-        while let Some(token) = token_stream.next() {
-            terms.push(Term::from_field_text(text_sk_field, &token.text));
-        }
-
-        if terms.is_empty() {
+        if words.is_empty() {
             return Ok(Response::new(SearchResponse { hits: vec![] }));
         }
 
-        // 3. Create a TermQuery for each token.
-        let term_queries: Vec<(Occur, Box<dyn Query>)> = terms
-            .into_iter()
-            .map(|term| {
-                let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
-                (Occur::Must, Box::new(term_query) as Box<dyn Query>)
-            })
-            .collect();
+        let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
 
-        // 4. Combine them into a BooleanQuery.
-        let final_query = BooleanQuery::new(term_queries);
-        // --- END OF LOGIC ---
+        // ===============================
+        // LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
+        // ===============================
+        {
+            let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
+            for word in &words {
+                let edge_term =
+                    Term::from_field_text(prefix_edge_field, word);
+                let full_term =
+                    Term::from_field_text(prefix_full_field, word);
+
+                let per_word_query = BooleanQuery::new(vec![
+                    (
+                        Occur::Should,
+                        Box::new(TermQuery::new(
+                            edge_term,
+                            IndexRecordOption::Basic,
+                        )),
+                    ),
+                    (
+                        Occur::Should,
+                        Box::new(TermQuery::new(
+                            full_term,
+                            IndexRecordOption::Basic,
+                        )),
+                    ),
+                ]);
+                must_clauses.push((Occur::Must, Box::new(per_word_query) as Box<dyn Query>));
+            }
+
+            if !must_clauses.is_empty() {
+                let prefix_query = BooleanQuery::new(must_clauses);
+                let boosted_query =
+                    BoostQuery::new(Box::new(prefix_query), 4.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        // ===============================
+        // LAYER 2: FUZZY MATCHING (HIGH PRIORITY, Boost: 3.0)
+        // ===============================
+        {
+            let last_word = words.last().unwrap();
+            let fuzzy_term =
+                Term::from_field_text(prefix_full_field, last_word);
+            let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true);
+            let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0);
+            query_layers.push((Occur::Should, Box::new(boosted_query)));
+        }
+
+        // ===============================
+        // LAYER 3: PHRASE MATCHING WITH SLOP (MEDIUM PRIORITY, Boost: 2.0)
+        // ===============================
+        if words.len() > 1 {
+            let slop_parser =
+                QueryParser::for_index(&index, vec![prefix_full_field]);
+            let slop_query_str = format!("\"{}\"~3", normalized_query);
+            if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) {
+                let boosted_query = BoostQuery::new(slop_query, 2.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        // ===============================
+        // LAYER 4: NGRAM SUBSTRING MATCHING (LOWEST PRIORITY, Boost: 1.0)
+        // ===============================
+        {
+            let ngram_parser =
+                QueryParser::for_index(&index, vec![text_ngram_field]);
+            if let Ok(ngram_query) =
+                ngram_parser.parse_query(&normalized_query)
+            {
+                let boosted_query = BoostQuery::new(ngram_query, 1.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        let master_query = BooleanQuery::new(query_layers);
 
         let top_docs = searcher
-            .search(&final_query, &TopDocs::with_limit(100))
+            .search(&master_query, &TopDocs::with_limit(100))
            .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
 
         let mut hits = Vec::new();
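Reviewer note: the layered query above, pulled out as a standalone helper for readability. This is a minimal sketch, not code from this commit; `build_prefix_and_fuzzy_layers` is a hypothetical name, and the phrase-slop and ngram layers are omitted here because they need the `QueryParser` and `Index` from the surrounding function.

use tantivy::query::{BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::Term;

// Hypothetical helper mirroring the commit's query construction:
// every whitespace-separated word must match either the edge-ngram
// prefix field or the full-word field (boost 4.0), and the last word
// is additionally matched fuzzily (boost 3.0).
fn build_prefix_and_fuzzy_layers(
    prefix_edge_field: Field,
    prefix_full_field: Field,
    words: &[&str],
) -> Vec<(Occur, Box<dyn Query>)> {
    let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();

    // LAYER 1: per-word prefix match, all words required.
    let must_clauses: Vec<(Occur, Box<dyn Query>)> = words
        .iter()
        .map(|word| {
            let per_word = BooleanQuery::new(vec![
                (
                    Occur::Should,
                    Box::new(TermQuery::new(
                        Term::from_field_text(prefix_edge_field, word),
                        IndexRecordOption::Basic,
                    )) as Box<dyn Query>,
                ),
                (
                    Occur::Should,
                    Box::new(TermQuery::new(
                        Term::from_field_text(prefix_full_field, word),
                        IndexRecordOption::Basic,
                    )) as Box<dyn Query>,
                ),
            ]);
            (Occur::Must, Box::new(per_word) as Box<dyn Query>)
        })
        .collect();
    if !must_clauses.is_empty() {
        let prefix_query = BooleanQuery::new(must_clauses);
        query_layers.push((
            Occur::Should,
            Box::new(BoostQuery::new(Box::new(prefix_query), 4.0)),
        ));
    }

    // LAYER 2: fuzzy match on the word currently being typed.
    if let Some(last_word) = words.last() {
        let fuzzy = FuzzyTermQuery::new(
            Term::from_field_text(prefix_full_field, last_word),
            2,    // max edit distance
            true, // a transposition counts as one edit
        );
        query_layers.push((
            Occur::Should,
            Box::new(BoostQuery::new(Box::new(fuzzy), 3.0)),
        ));
    }

    query_layers
}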
@@ -142,7 +204,10 @@ impl Searcher for SearcherService {
 
             if let Some(pg_id_value) = doc.get_first(pg_id_field) {
                 if let Some(pg_id) = pg_id_value.as_u64() {
-                    hits.push(Hit { id: pg_id as i64, score });
+                    hits.push(Hit {
+                        id: pg_id as i64,
+                        score,
+                    });
                 }
             }
         }
@@ -152,22 +217,40 @@ impl Searcher for SearcherService {
     }
 }
 
-/// Registers the Slovak ngram tokenizer
-fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
+/// This function is now an exact mirror of the one in `server/src/search_schema.rs`
+fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
     use tantivy::tokenizer::*;
 
     let tokenizer_manager = index.tokenizers();
 
-    if tokenizer_manager.get("slovak").is_none() { // CHANGED BACK TO "slovak"
-        let slovak_ngram_tokenizer = TextAnalyzer::builder(
-            NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
-        )
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .filter(AsciiFoldingFilter)
-        .build();
+    // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
+    if tokenizer_manager.get("slovak_prefix_edge").is_none() {
+        let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+        tokenizer_manager.register("slovak_prefix_edge", tokenizer);
+    }
 
-        tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // CHANGED BACK TO "slovak"
+    // TOKENIZER for `prefix_full`: Simple word tokenizer
+    if tokenizer_manager.get("slovak_prefix_full").is_none() {
+        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+        tokenizer_manager.register("slovak_prefix_full", tokenizer);
+    }
+
+    // NGRAM TOKENIZER: For substring matching.
+    if tokenizer_manager.get("slovak_ngram").is_none() {
+        let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+        tokenizer_manager.register("slovak_ngram", tokenizer);
     }
 
     Ok(())
@@ -1,11 +1,11 @@
-// src/indexer.rs
+// server/src/indexer.rs
 
-use std::path::Path;
 use sqlx::{PgPool, Row};
-use tantivy::schema::{Schema, Term};
-use tantivy::{doc, Index, IndexWriter};
+use tantivy::schema::Term;
+use tantivy::{doc, IndexWriter};
 use tokio::sync::mpsc::Receiver;
 use tracing::{error, info, warn};
+use tantivy::schema::Schema;
 use crate::search_schema;
 
 const INDEX_DIR: &str = "./tantivy_indexes";
@@ -49,44 +49,39 @@ async fn handle_add_or_update(
     pool: &PgPool,
     data: IndexCommandData,
 ) -> anyhow::Result<()> {
-    // 1. Fetch the full row data from PostgreSQL
     let qualified_table = format!("gen.\"{}\"", data.table_name);
     let query_str = format!(
         "SELECT to_jsonb(t) AS data FROM {} t WHERE id = $1",
         qualified_table
     );
 
     let row = sqlx::query(&query_str)
         .bind(data.row_id)
         .fetch_one(pool)
         .await?;
     let json_data: serde_json::Value = row.try_get("data")?;
 
-    // 2. Extract all text content for Slovak processing
     let slovak_text = extract_text_content(&json_data);
 
-    // 3. Open the index and write the document
     let (mut writer, schema) = get_index_writer(&data.table_name)?;
     let pg_id_field = schema.get_field("pg_id").unwrap();
-    let text_sk_field = schema.get_field("text_sk").unwrap();
+    let prefix_edge_field = schema.get_field("prefix_edge").unwrap();
+    let prefix_full_field = schema.get_field("prefix_full").unwrap();
+    let text_ngram_field = schema.get_field("text_ngram").unwrap();
 
-    // First, delete any existing document with this ID to handle updates
    let id_term = Term::from_field_u64(pg_id_field, data.row_id as u64);
     writer.delete_term(id_term);
 
-    // Add the new document
     writer.add_document(doc!(
         pg_id_field => data.row_id as u64,
-        text_sk_field => slovak_text
+        prefix_edge_field => slovak_text.clone(),
+        prefix_full_field => slovak_text.clone(),
+        text_ngram_field => slovak_text
     ))?;
 
-    // 4. Commit changes
     writer.commit()?;
     info!(
-        "Successfully indexed Slovak document id:{} for table:{}",
+        "Successfully indexed document id:{} for table:{}",
         data.row_id, data.table_name
     );
 
     Ok(())
 }
@@ -123,7 +118,7 @@ fn get_index_writer(
 /// Extract all text content from a JSON object for indexing
 fn extract_text_content(json_data: &serde_json::Value) -> String {
     let mut full_text = String::new();
 
     if let Some(obj) = json_data.as_object() {
         for value in obj.values() {
             match value {
@@ -135,11 +130,10 @@ fn extract_text_content(json_data: &serde_json::Value) -> String {
                     full_text.push_str(&n.to_string());
                     full_text.push(' ');
                 }
-                // We could recursively handle nested objects if needed
                 _ => {}
             }
         }
     }
 
     full_text.trim().to_string()
 }
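Reviewer note: what actually lands in the three text fields per row. A minimal sketch assuming serde_json and a made-up row; `extract_text_content` here is a local mirror of the function in this diff, and the column names are hypothetical.

use serde_json::{json, Value};

// Minimal mirror of the commit's extract_text_content: concatenate the
// row's top-level string and number values, separated by spaces.
fn extract_text_content(json_data: &Value) -> String {
    let mut full_text = String::new();
    if let Some(obj) = json_data.as_object() {
        for value in obj.values() {
            match value {
                Value::String(s) => {
                    full_text.push_str(s);
                    full_text.push(' ');
                }
                Value::Number(n) => {
                    full_text.push_str(&n.to_string());
                    full_text.push(' ');
                }
                _ => {} // nested objects, arrays, nulls are skipped
            }
        }
    }
    full_text.trim().to_string()
}

fn main() {
    // Hypothetical row as returned by `SELECT to_jsonb(t) ...`;
    // the column names are made up for illustration.
    let row = json!({
        "id": 7,
        "nazov": "Čajovňa u suseda",
        "mesto": "Bratislava",
        "detail": { "poschodie": 2 }
    });
    // With serde_json's default (sorted) key order this prints something
    // like: "7 Bratislava Čajovňa u suseda" — the same string is then
    // written to prefix_edge, prefix_full, and text_ngram.
    println!("{}", extract_text_content(&row));
}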
@@ -5,59 +5,93 @@ use tantivy::tokenizer::*;
 use tantivy::Index;
 use std::path::Path;
 
-/// Creates a Tantivy schema optimized for Slovak ngram search
+/// Creates a hybrid Slovak search schema with optimized prefix fields.
 pub fn create_search_schema() -> Schema {
     let mut schema_builder = Schema::builder();
 
-    // ID field to link back to PostgreSQL
     schema_builder.add_u64_field("pg_id", INDEXED | STORED);
 
-    // Slovak text field with ngram tokenizer for search-as-you-type
-    let text_field_indexing = TextFieldIndexing::default()
-        .set_tokenizer("slovak") // KEEP THE SAME NAME
+    // FIELD 1: For prefixes (1-15 chars).
+    let short_prefix_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak_prefix_edge")
         .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-
-    let text_options = TextOptions::default()
-        .set_indexing_options(text_field_indexing)
+    let short_prefix_options = TextOptions::default()
+        .set_indexing_options(short_prefix_indexing)
         .set_stored();
+    schema_builder.add_text_field("prefix_edge", short_prefix_options);
 
-    schema_builder.add_text_field("text_sk", text_options);
+    // FIELD 2: For the full word.
+    let full_word_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak_prefix_full")
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let full_word_options = TextOptions::default()
+        .set_indexing_options(full_word_indexing)
+        .set_stored();
+    schema_builder.add_text_field("prefix_full", full_word_options);
+
+    // NGRAM FIELD: For substring matching.
+    let ngram_field_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak_ngram")
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let ngram_options = TextOptions::default()
+        .set_indexing_options(ngram_field_indexing)
+        .set_stored();
+    schema_builder.add_text_field("text_ngram", ngram_options);
 
     schema_builder.build()
 }
 
-/// Registers the Slovak ngram tokenizer with the index
+/// Registers all necessary Slovak tokenizers with the index.
 pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
     let tokenizer_manager = index.tokenizers();
 
-    // Create Slovak ngram tokenizer pipeline - BUT REGISTER AS "slovak"
-    let slovak_ngram_tokenizer = TextAnalyzer::builder(
-        NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
-    )
-    .filter(RemoveLongFilter::limit(40)) // Remove very long tokens
-    .filter(LowerCaser) // Convert to lowercase
-    .filter(AsciiFoldingFilter) // Handle diacritics: č->c, š->s, ž->z, etc.
-    .build();
+    // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
+    if tokenizer_manager.get("slovak_prefix_edge").is_none() {
+        // YOUR RECOMMENDED FIX: Extend the max_gram to a more practical limit.
+        let tokenizer = TextAnalyzer::builder(
+            NgramTokenizer::new(1, 15, true)?
+        )
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .filter(AsciiFoldingFilter)
+        .build();
+        tokenizer_manager.register("slovak_prefix_edge", tokenizer);
+    }
 
-    tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // SAME NAME
+    // TOKENIZER for `prefix_full`: Simple word tokenizer
+    if tokenizer_manager.get("slovak_prefix_full").is_none() {
+        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+        tokenizer_manager.register("slovak_prefix_full", tokenizer);
+    }
+
+    // NGRAM TOKENIZER: For substring matching.
+    if tokenizer_manager.get("slovak_ngram").is_none() {
+        let tokenizer = TextAnalyzer::builder(
+            NgramTokenizer::new(3, 3, false)?
+        )
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .filter(AsciiFoldingFilter)
+        .build();
+        tokenizer_manager.register("slovak_ngram", tokenizer);
+    }
 
     Ok(())
 }
 
-/// Gets or creates an index for a table with proper Slovak ngram processing
 pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
     let index_path = Path::new("./tantivy_indexes").join(table_name);
     std::fs::create_dir_all(&index_path)?;
 
     let index = if index_path.join("meta.json").exists() {
         Index::open_in_dir(&index_path)?
     } else {
         let schema = create_search_schema();
         Index::create_in_dir(&index_path, schema)?
     };
 
-    // Always register the tokenizer when opening
     register_slovak_tokenizer(&index)?;
 
     Ok(index)
 }
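Reviewer note: a quick sanity check of the new `slovak_prefix_edge` pipeline. A minimal sketch, not part of this commit; the sample word and the expected token list are illustrative.

use tantivy::tokenizer::*;

fn main() -> tantivy::Result<()> {
    // Same pipeline as registered under "slovak_prefix_edge" above:
    // edge n-grams of length 1..=15, lowercased, diacritics folded.
    let mut analyzer = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build();

    let mut stream = analyzer.token_stream("Čajovňa");
    while let Some(token) = stream.next() {
        // Expected (illustrative): "c", "ca", "caj", "cajo", "cajov", ...
        // so a user typing any prefix of "cajovna" hits an indexed term,
        // which is what the boost-4.0 prefix layer in lib.rs relies on.
        println!("{}", token.text);
    }
    Ok(())
}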