From 83af14caa4740fba57c6b75152bc40d77935ac04 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Wed, 25 Oct 2023 09:17:31 +0200 Subject: [PATCH] Fix range query (#2226) Fix range query end check in advance Rename vars to reduce ambiguity add tests Fixes #2225 --- src/indexer/index_writer.rs | 119 +++++++++++++----- .../range_query/fast_field_range_query.rs | 63 +++++++++- 2 files changed, 145 insertions(+), 37 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 4357ce04cf..2323806d12 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1706,7 +1706,8 @@ mod tests { let old_reader = index.reader()?; - let id_exists = |id| id % 3 != 0; // 0 does not exist + // Every 3rd doc has only id field + let id_is_full_doc = |id| id % 3 != 0; let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3"; // rotate left @@ -1722,7 +1723,7 @@ mod tests { let facet = Facet::from(&("/cola/".to_string() + &id.to_string())); let ip = ip_from_id(id); - if !id_exists(id) { + if !id_is_full_doc(id) { // every 3rd doc has no ip field index_writer.add_document(doc!( id_field=>id, @@ -1842,7 +1843,7 @@ mod tests { let num_docs_with_values = expected_ids_and_num_occurrences .iter() - .filter(|(id, _id_occurrences)| id_exists(**id)) + .filter(|(id, _id_occurrences)| id_is_full_doc(**id)) .map(|(_, id_occurrences)| *id_occurrences as usize) .sum::(); @@ -1866,7 +1867,7 @@ mod tests { if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 { let mut expected_multi_ips: Vec<_> = id_list .iter() - .filter(|id| id_exists(**id)) + .filter(|id| id_is_full_doc(**id)) .flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)]) .collect(); assert_eq!(num_ips, expected_multi_ips.len() as u32); @@ -1904,7 +1905,7 @@ mod tests { let expected_ips = expected_ids_and_num_occurrences .keys() .flat_map(|id| { - if !id_exists(*id) { + if !id_is_full_doc(*id) { None } else { Some(Ipv6Addr::from_u128(*id as u128)) @@ -1916,7 +1917,7 @@ mod tests { let expected_ips = expected_ids_and_num_occurrences .keys() .filter_map(|id| { - if !id_exists(*id) { + if !id_is_full_doc(*id) { None } else { Some(Ipv6Addr::from_u128(*id as u128)) @@ -1951,7 +1952,7 @@ mod tests { let id = id_reader.first(doc).unwrap(); let vals: Vec = ff_reader.values_for_doc(doc).collect(); - if id_exists(id) { + if id_is_full_doc(id) { assert_eq!(vals.len(), 2); assert_eq!(vals[0], vals[1]); assert!(expected_ids_and_num_occurrences.contains_key(&vals[0])); @@ -1961,7 +1962,7 @@ mod tests { } let bool_vals: Vec = bool_ff_reader.values_for_doc(doc).collect(); - if id_exists(id) { + if id_is_full_doc(id) { assert_eq!(bool_vals.len(), 2); assert_ne!(bool_vals[0], bool_vals[1]); } else { @@ -1990,7 +1991,7 @@ mod tests { .as_u64() .unwrap(); assert!(expected_ids_and_num_occurrences.contains_key(&id)); - if id_exists(id) { + if id_is_full_doc(id) { let id2 = store_reader .get::(doc_id) .unwrap() @@ -2037,7 +2038,7 @@ mod tests { let (existing_id, count) = (*id, *count); let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64; assert_eq!(get_num_hits(id_field), count); - if !id_exists(existing_id) { + if !id_is_full_doc(existing_id) { continue; } assert_eq!(get_num_hits(text_field), count); @@ -2087,7 +2088,7 @@ mod tests { // for (existing_id, count) in &expected_ids_and_num_occurrences { let (existing_id, count) = (*existing_id, *count); - if !id_exists(existing_id) { + if !id_is_full_doc(existing_id) { continue; } let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; @@ -2104,34 +2105,84 @@ mod tests { } } - // assert data is like expected + // Range query // - for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) { - let (existing_id, count) = (*existing_id, *count); - if !id_exists(existing_id) { - continue; - } - let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| { - format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string()) + // Take half as sample + let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect(); + sample.sort_by_key(|(k, _num_occurences)| *k); + // sample.truncate(sample.len() / 2); + if !sample.is_empty() { + let (left_sample, right_sample) = sample.split_at(sample.len() / 2); + + let expected_count = |sample: &[(&u64, &u64)]| { + sample + .iter() + .filter(|(id, _)| id_is_full_doc(**id)) + .map(|(_id, num_occurences)| **num_occurences) + .sum::() }; - let ip = ip_from_id(existing_id); - - let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; - // Range query on single value field - let query = gen_query_inclusive("ip", ip, ip); - assert_eq!(do_search_ip_field(&query), count); - - // Range query on multi value field - let query = gen_query_inclusive("ips", ip, ip); + fn gen_query_inclusive( + field: &str, + from: T1, + to: T2, + ) -> String { + format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string()) + } - assert_eq!(do_search_ip_field(&query), count); + // Query first half + if !left_sample.is_empty() { + let expected_count = expected_count(left_sample); + + let start_range = *left_sample[0].0; + let end_range = *left_sample.last().unwrap().0; + let query = gen_query_inclusive("id_opt", start_range, end_range); + assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count); + + // Range query on ip field + let ip1 = ip_from_id(start_range); + let ip2 = ip_from_id(end_range); + let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; + let query = gen_query_inclusive("ip", ip1, ip2); + assert_eq!(do_search_ip_field(&query), expected_count); + let query = gen_query_inclusive("ip", "*", ip2); + assert_eq!(do_search_ip_field(&query), expected_count); + // Range query on multi value field + let query = gen_query_inclusive("ips", ip1, ip2); + assert_eq!(do_search_ip_field(&query), expected_count); + let query = gen_query_inclusive("ips", "*", ip2); + assert_eq!(do_search_ip_field(&query), expected_count); + } + // Query second half + if !right_sample.is_empty() { + let expected_count = expected_count(right_sample); + let start_range = *right_sample[0].0; + let end_range = *right_sample.last().unwrap().0; + // Range query on id opt field + let query = + gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string()); + assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count); + + // Range query on ip field + let ip1 = ip_from_id(start_range); + let ip2 = ip_from_id(end_range); + let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64; + let query = gen_query_inclusive("ip", ip1, ip2); + assert_eq!(do_search_ip_field(&query), expected_count); + let query = gen_query_inclusive("ip", ip1, "*"); + assert_eq!(do_search_ip_field(&query), expected_count); + // Range query on multi value field + let query = gen_query_inclusive("ips", ip1, ip2); + assert_eq!(do_search_ip_field(&query), expected_count); + let query = gen_query_inclusive("ips", ip1, "*"); + assert_eq!(do_search_ip_field(&query), expected_count); + } } // ip range query on fast field // for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) { let (existing_id, count) = (*existing_id, *count); - if !id_exists(existing_id) { + if !id_is_full_doc(existing_id) { continue; } let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| { @@ -2159,7 +2210,7 @@ mod tests { .first_or_default_col(9999); for doc_id in segment_reader.doc_ids_alive() { let id = ff_reader.get_val(doc_id); - if !id_exists(id) { + if !id_is_full_doc(id) { continue; } let facet_ords: Vec = facet_reader.facet_ords(doc_id).collect(); @@ -2197,6 +2248,12 @@ mod tests { Ok(index) } + #[test] + fn test_fast_field_range() { + let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect(); + assert!(test_operation_strategy(&ops, false, true).is_ok()); + } + #[test] fn test_sort_index_on_opt_field_regression() { assert!(test_operation_strategy( diff --git a/src/query/range_query/fast_field_range_query.rs b/src/query/range_query/fast_field_range_query.rs index c36a826aa8..46de0a5de4 100644 --- a/src/query/range_query/fast_field_range_query.rs +++ b/src/query/range_query/fast_field_range_query.rs @@ -31,8 +31,8 @@ impl VecCursor { self.current_pos = 0; &mut self.docs } - fn last_value(&self) -> Option { - self.docs.iter().last().cloned() + fn last_doc(&self) -> Option { + self.docs.last().cloned() } fn is_empty(&self) -> bool { self.current().is_none() @@ -112,15 +112,15 @@ impl RangeDocSet { finished_to_end = true; } - let last_value = self.loaded_docs.last_value(); + let last_doc = self.loaded_docs.last_doc(); let doc_buffer: &mut Vec = self.loaded_docs.get_cleared_data(); self.column.get_docids_for_value_range( self.value_range.clone(), self.next_fetch_start..end, doc_buffer, ); - if let Some(last_value) = last_value { - while self.loaded_docs.current() == Some(last_value) { + if let Some(last_doc) = last_doc { + while self.loaded_docs.current() == Some(last_doc) { self.loaded_docs.next(); } } @@ -136,7 +136,7 @@ impl DocSet for RangeDocSe if let Some(docid) = self.loaded_docs.next() { return docid; } - if self.next_fetch_start >= self.column.values.num_vals() { + if self.next_fetch_start >= self.column.num_docs() { return TERMINATED; } self.fetch_block(); @@ -177,3 +177,54 @@ impl DocSet for RangeDocSe 0 // heuristic possible by checking number of hits when fetching a block } } + +#[cfg(test)] +mod tests { + use crate::collector::Count; + use crate::directory::RamDirectory; + use crate::query::RangeQuery; + use crate::{schema, IndexBuilder, TantivyDocument}; + + #[test] + fn range_query_fast_optional_field_minimum() { + let mut schema_builder = schema::SchemaBuilder::new(); + let id_field = schema_builder.add_text_field("id", schema::STRING); + let score_field = schema_builder.add_u64_field("score", schema::FAST | schema::INDEXED); + + let dir = RamDirectory::default(); + let index = IndexBuilder::new() + .schema(schema_builder.build()) + .open_or_create(dir) + .unwrap(); + + { + let mut writer = index.writer(15_000_000).unwrap(); + + let count = 1000; + for i in 0..count { + let mut doc = TantivyDocument::new(); + doc.add_text(id_field, format!("doc{i}")); + + let nb_scores = i % 2; // 0 or 1 scores + for _ in 0..nb_scores { + doc.add_u64(score_field, 80); + } + + writer.add_document(doc).unwrap(); + } + writer.commit().unwrap(); + } + + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let query = RangeQuery::new_u64_bounds( + "score".to_string(), + std::ops::Bound::Included(70), + std::ops::Bound::Unbounded, + ); + + let count = searcher.search(&query, &Count).unwrap(); + assert_eq!(count, 500); + } +}