From fd1d256d61694f10075841143a2491a07f281c0b Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Mon, 27 Nov 2023 22:51:01 -0700 Subject: [PATCH 1/4] rustdoc-search: remove the now-redundant `validateResult` This function dates back to 9a45c9d7c6928743f9e7a7161bf564a65bfc0577 and seems to have been made obsolete when `addIntoResult` grew the ability to check the levenshtein distance matching with commit ba824ec52beb0e49b64e86837c1402a0c2d0c971. --- src/librustdoc/html/static/js/search.js | 57 ------------------------- 1 file changed, 57 deletions(-) diff --git a/src/librustdoc/html/static/js/search.js b/src/librustdoc/html/static/js/search.js index 5d348d3f17635..a61f2f5e3b688 100644 --- a/src/librustdoc/html/static/js/search.js +++ b/src/librustdoc/html/static/js/search.js @@ -1329,25 +1329,6 @@ function initSearch(rawSearchIndex) { return 0; }); - let nameSplit = null; - if (parsedQuery.elems.length === 1) { - const hasPath = typeof parsedQuery.elems[0].path === "undefined"; - nameSplit = hasPath ? null : parsedQuery.elems[0].path; - } - - for (const result of result_list) { - // this validation does not make sense when searching by types - if (result.dontValidate) { - continue; - } - const name = result.item.name.toLowerCase(), - path = result.item.path.toLowerCase(), - parent = result.item.parent; - - if (!isType && !validateResult(name, path, nameSplit, parent)) { - result.id = -1; - } - } return transformResults(result_list); } @@ -2284,44 +2265,6 @@ function initSearch(rawSearchIndex) { return ret; } - /** - * Validate performs the following boolean logic. For example: - * "File::open" will give IF A PARENT EXISTS => ("file" && "open") - * exists in (name || path || parent) OR => ("file" && "open") exists in - * (name || path ) - * - * This could be written functionally, but I wanted to minimise - * functions on stack. 
- * - * @param {string} name - The name of the result - * @param {string} path - The path of the result - * @param {string} keys - The keys to be used (["file", "open"]) - * @param {Object} parent - The parent of the result - * - * @return {boolean} - Whether the result is valid or not - */ - function validateResult(name, path, keys, parent, maxEditDistance) { - if (!keys || !keys.length) { - return true; - } - for (const key of keys) { - // each check is for validation so we negate the conditions and invalidate - if (!( - // check for an exact name match - name.indexOf(key) > -1 || - // then an exact path match - path.indexOf(key) > -1 || - // next if there is a parent, check for exact parent match - (parent !== undefined && parent.name !== undefined && - parent.name.toLowerCase().indexOf(key) > -1) || - // lastly check to see if the name was an editDistance match - editDistance(name, key, maxEditDistance) <= maxEditDistance)) { - return false; - } - } - return true; - } - function nextTab(direction) { const next = (searchState.currentTab + direction + 3) % searchState.focusedByTab.length; searchState.focusedByTab[searchState.currentTab] = document.activeElement; From 9a9695a05268fd6548d416e511d5222b35b62db0 Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Mon, 27 Nov 2023 22:41:45 -0700 Subject: [PATCH 2/4] rustdoc-search: use set ops for ranking and filtering This commit adds ranking and quick filtering to type-based search, improving performance and having it order results based on their type signatures. Motivation ---------- If I write a query like `str -> String`, a lot of functions come up. That's to be expected, but `String::from_str` should come up on top, and it doesn't right now. This is because the sorting algorithm is based on the functions name, and doesn't consider the type signature at all. `slice::join` even comes up above it! To fix this, the sorting should take into account the function's signature, and the closer match should come up on top. 
Guide-level description ----------------------- When searching by type signature, types with a "closer" match will show up above types that match less precisely. Reference-level explanation --------------------------- Function signature search works in two major phases: * A compact "fingerprint," based on the [bloom filter] technique, is used to check for matches and to estimate the distance. It sometimes has false positive matches, but it also operates on 128-bit contiguous memory and requires no backtracking, so it performs a lot better than real unification. The fingerprint represents the set of items in the type signature, but it does not represent nesting, and it ignores when the same item appears more than once. The result is rejected if any query bits are absent in the function, or if the distance is higher than the current maximum and 200 results have already been found. * The second step performs unification. This is where nesting and true bag semantics are taken into account, and it has no false positives. It uses a recursive, backtracking algorithm. The result is rejected if any query elements are absent in the function. [bloom filter]: https://en.wikipedia.org/wiki/Bloom_filter Drawbacks --------- This makes the code bigger. More than that, this design is a subtle trade-off. It makes the cases I've tested against measurably faster, but it's not clear how well this extends to other crates with potentially more functions and fewer types. The more complex things get, the more important it is to gather a good set of data to test with (this is arguably more important than the actual benchmarking infrastructure right now). Rationale and alternatives -------------------------- Throwing a bloom filter in front makes it faster. More than that, it tries to take a tactic where the system can not only check for potential matches, but also gets an accurate distance function without needing to do unification.
That way it can skip unification even on items that have the needed elems, as long as they have more items than the currently found maximum. If I didn't want to be able to cheaply do set operations on the fingerprint, a [cuckoo filter] is supposed to have better performance. But the nice bit-banging set intersection doesn't work AFAIK. I also looked into [minhashing], but since it's actually an unbiased estimate of the similarity coefficient, I'm not sure how it could be used to skip unification (I wouldn't know if the estimate was too low or too high). This function actually uses the number of distinct items as its "distance function." This should give the same results that it would have gotten from a Jaccard Distance $1-\frac{|F\cap{}Q|}{|F\cup{}Q|}$, while being cheaper to compute. This is because: * The function $F$ must be a superset of the query $Q$, so their union is just $F$ and the intersection is $Q$ and it can be reduced to $1-\frac{|Q|}{|F|}$. * There are no magic thresholds. These values are only being used to compare against each other while sorting (and, if 200 results are found, to compare with the maximum match). This means we only care if one value is bigger than the other, not what its actual value is, and since $Q$ is the same for everything, it can be safely left out, reducing the formula to $1-\frac{1}{|F|} = \frac{|F|}{|F|}-\frac{1}{|F|} = \frac{|F|-1}{|F|}$, which grows monotonically with $|F|$. And, since the values are only being compared with each other, $|F|$ is fine. Prior art --------- This is significantly different from how Hoogle does it. It doesn't account for order, and it has no special account for nesting, though `Box<t>` is still two items, while `t` is only one. This should give the same results that it would have gotten from a Jaccard Distance $1-\frac{|A\cap{}B|}{|A\cup{}B|}$, while being cheaper to compute. Unresolved questions -------------------- `[]` and `()`, the slice/array and tuple/union operators, are ignored while building the signature for the query.
This is because they match more than one thing, making them ambiguous. Unfortunately, this also makes them a performance cliff. Is this likely to be a problem? Right now, the system just stashes the type distance into the same field that Levenshtein distance normally goes in. This means exact query matches show up on top (for example, if you have a function like `fn nothing(a: Nothing, b: i32)`, then searching for `nothing` will show it on top even if there's another function with `fn bar(x: Nothing)` that's technically a closer match in type signature). Future possibilities -------------------- It should be possible to adopt more sorting criteria to act as a tie breaker, which could be determined during unification. [cuckoo filter]: https://en.wikipedia.org/wiki/Cuckoo_filter [minhashing]: https://en.wikipedia.org/wiki/MinHash --- src/librustdoc/html/static/js/externs.js | 3 +- src/librustdoc/html/static/js/search.js | 225 +++++++++++++++++++---- tests/rustdoc-js/assoc-type.js | 6 +- tests/rustdoc-js/big-result.js | 39 ++++ tests/rustdoc-js/big-result.rs | 61 ++++++ tests/rustdoc-js/full-path-function.js | 4 +- tests/rustdoc-js/generics.js | 1 + tests/rustdoc-js/impl-trait.js | 2 +- tests/rustdoc-js/type-parameters.js | 19 +- 9 files changed, 304 insertions(+), 56 deletions(-) create mode 100644 tests/rustdoc-js/big-result.js create mode 100644 tests/rustdoc-js/big-result.rs diff --git a/src/librustdoc/html/static/js/externs.js b/src/librustdoc/html/static/js/externs.js index 2338931a18fd2..93709e4e830ad 100644 --- a/src/librustdoc/html/static/js/externs.js +++ b/src/librustdoc/html/static/js/externs.js @@ -14,7 +14,7 @@ function initSearch(searchIndex){} * pathWithoutLast: Array, * pathLast: string, * generics: Array, - * bindings: Map<(string|integer), Array>, + * bindings: Map>, * }} */ let QueryElement; @@ -42,6 +42,7 @@ let ParserState; * totalElems: number, * literalSearch: boolean, * corrections: Array<{from: string, to: integer}>, + * typeFingerprint: 
Uint32Array, * }} */ let ParsedQuery; diff --git a/src/librustdoc/html/static/js/search.js b/src/librustdoc/html/static/js/search.js index a61f2f5e3b688..dc47e093ecd75 100644 --- a/src/librustdoc/html/static/js/search.js +++ b/src/librustdoc/html/static/js/search.js @@ -238,6 +238,10 @@ function initSearch(rawSearchIndex) { * @type {Array} */ let searchIndex; + /** + * @type {Uint32Array} + */ + let functionTypeFingerprint; let currentResults; /** * Map from normalized type names to integers. Used to make type search @@ -1038,6 +1042,8 @@ function initSearch(rawSearchIndex) { correction: null, proposeCorrectionFrom: null, proposeCorrectionTo: null, + // bloom filter build from type ids + typeFingerprint: new Uint32Array(4), }; } @@ -1133,7 +1139,6 @@ function initSearch(rawSearchIndex) { query.error = err; return query; } - if (!query.literalSearch) { // If there is more than one element in the query, we switch to literalSearch in any // case. @@ -1941,8 +1946,7 @@ function initSearch(rawSearchIndex) { * @param {integer} path_dist */ function addIntoResults(results, fullId, id, index, dist, path_dist, maxEditDistance) { - const inBounds = dist <= maxEditDistance || index !== -1; - if (dist === 0 || (!parsedQuery.literalSearch && inBounds)) { + if (dist <= maxEditDistance || index !== -1) { if (results.has(fullId)) { const result = results.get(fullId); if (result.dontValidate || result.dist <= dist) { @@ -1990,17 +1994,37 @@ function initSearch(rawSearchIndex) { const fullId = row.id; const searchWord = searchWords[pos]; - const in_args = row.type && row.type.inputs - && checkIfInList(row.type.inputs, elem, row.type.where_clause); - if (in_args) { - // path_dist is 0 because no parent path information is currently stored - // in the search index - addIntoResults(results_in_args, fullId, pos, -1, 0, 0, maxEditDistance); - } - const returned = row.type && row.type.output - && checkIfInList(row.type.output, elem, row.type.where_clause); - if (returned) { - 
addIntoResults(results_returned, fullId, pos, -1, 0, 0, maxEditDistance); + // fpDist is a minimum possible type distance, where "type distance" is the number of + // atoms in the function not present in the query + const tfpDist = compareTypeFingerprints( + fullId, + parsedQuery.typeFingerprint + ); + if (tfpDist !== null && + !(results_in_args.size >= MAX_RESULTS && tfpDist > results_in_args.max_dist) + ) { + const in_args = row.type && row.type.inputs + && checkIfInList(row.type.inputs, elem, row.type.where_clause); + if (in_args) { + results_in_args.max_dist = Math.max(results_in_args.max_dist || 0, tfpDist); + const maxDist = results_in_args.size < MAX_RESULTS ? + (tfpDist + 1) : + results_in_args.max_dist; + addIntoResults(results_in_args, fullId, pos, -1, tfpDist, 0, maxDist); + } + } + if (tfpDist !== false && + !(results_returned.size >= MAX_RESULTS && tfpDist > results_returned.max_dist) + ) { + const returned = row.type && row.type.output + && checkIfInList(row.type.output, elem, row.type.where_clause); + if (returned) { + results_returned.max_dist = Math.max(results_returned.max_dist || 0, tfpDist); + const maxDist = results_returned.size < MAX_RESULTS ? + (tfpDist + 1) : + results_returned.max_dist; + addIntoResults(results_returned, fullId, pos, -1, tfpDist, 0, maxDist); + } } if (!typePassesFilter(elem.typeFilter, row.ty)) { @@ -2059,6 +2083,17 @@ function initSearch(rawSearchIndex) { return; } + const tfpDist = compareTypeFingerprints( + row.id, + parsedQuery.typeFingerprint + ); + if (tfpDist === null) { + return; + } + if (results.size >= MAX_RESULTS && tfpDist > results.max_dist) { + return; + } + // If the result is too "bad", we return false and it ends this search. 
if (!unifyFunctionTypes( row.type.inputs, @@ -2077,7 +2112,8 @@ function initSearch(rawSearchIndex) { return; } - addIntoResults(results, row.id, pos, 0, 0, 0, Number.MAX_VALUE); + results.max_dist = Math.max(results.max_dist || 0, tfpDist); + addIntoResults(results, row.id, pos, 0, tfpDist, 0, Number.MAX_VALUE); } function innerRunQuery() { @@ -2197,14 +2233,17 @@ function initSearch(rawSearchIndex) { ); } + const fps = new Set(); for (const elem of parsedQuery.elems) { convertNameToId(elem); + buildFunctionTypeFingerprint(elem, parsedQuery.typeFingerprint, fps); } for (const elem of parsedQuery.returned) { convertNameToId(elem); + buildFunctionTypeFingerprint(elem, parsedQuery.typeFingerprint, fps); } - if (parsedQuery.foundElems === 1) { + if (parsedQuery.foundElems === 1 && parsedQuery.returned.length === 0) { if (parsedQuery.elems.length === 1) { const elem = parsedQuery.elems[0]; for (let i = 0, nSearchWords = searchWords.length; i < nSearchWords; ++i) { @@ -2220,26 +2259,6 @@ function initSearch(rawSearchIndex) { maxEditDistance ); } - } else if (parsedQuery.returned.length === 1) { - // We received one returned argument to check, so looking into returned values. - for (let i = 0, nSearchWords = searchWords.length; i < nSearchWords; ++i) { - const row = searchIndex[i]; - const in_returned = row.type && unifyFunctionTypes( - row.type.output, - parsedQuery.returned, - row.type.where_clause - ); - if (in_returned) { - addIntoResults( - results_others, - row.id, - i, - -1, - 0, - Number.MAX_VALUE - ); - } - } } } else if (parsedQuery.foundElems > 0) { for (let i = 0, nSearchWords = searchWords.length; i < nSearchWords; ++i) { @@ -2783,6 +2802,97 @@ ${item.displayPath}${name}\ }; } + /** + * Type fingerprints allow fast, approximate matching of types. + * + * This algo creates a compact representation of the type set using a Bloom filter. 
+ * This fingerprint is used three ways: + * + * - It accelerates the matching algorithm by checking the function fingerprint against the + * query fingerprint. If any bits are set in the query but not in the function, it can't + * match. + * + * - The fourth section has the number of distinct items in the set. + * This is the distance function, used for filtering and for sorting. + * + * [^1]: Distance is the relatively naive metric of counting the number of distinct items in + * the function that are not present in the query. + * + * @param {FunctionType|QueryElement} type - a single type + * @param {Uint32Array} output - write the fingerprint to this data structure: uses 128 bits + * @param {Set} fps - Set of distinct items + */ + function buildFunctionTypeFingerprint(type, output, fps) { + + let input = type.id; + // All forms of `[]` get collapsed down to one thing in the bloom filter. + // Differentiating between arrays and slices, if the user asks for it, is + // still done in the matching algorithm. + if (input === typeNameIdOfArray || input === typeNameIdOfSlice) { + input = typeNameIdOfArrayOrSlice; + } + if (input !== null) { + // https://docs.rs/rustc-hash/1.1.0/src/rustc_hash/lib.rs.html#60 + // Rotate is skipped because we're only doing one cycle anyway. + const h0 = Math.imul(input, 0x9e3779b9); + const h1 = Math.imul(479001599 ^ input, 0x9e3779b9); + const h2 = Math.imul(433494437 ^ input, 0x9e3779b9); + output[0] |= 1 << (h0 % 32); + output[1] |= 1 << (h1 % 32); + output[2] |= 1 << (h2 % 32); + fps.add(input); + } + for (const g of type.generics) { + buildFunctionTypeFingerprint(g, output, fps); + } + const fb = { + id: null, + ty: 0, + generics: [], + bindings: new Map(), + }; + for (const [k, v] of type.bindings.entries()) { + fb.id = k; + fb.generics = v; + buildFunctionTypeFingerprint(fb, output, fps); + } + output[3] = fps.size; + } + + /** + * Compare the query fingerprint with the function fingerprint. 
+ * + * @param {{number}} fullId - The function + * @param {{Uint32Array}} queryFingerprint - The query + * @returns {number|null} - Null if non-match, number if distance + * This function might return 0! + */ + function compareTypeFingerprints(fullId, queryFingerprint) { + + const fh0 = functionTypeFingerprint[fullId * 4]; + const fh1 = functionTypeFingerprint[(fullId * 4) + 1]; + const fh2 = functionTypeFingerprint[(fullId * 4) + 2]; + const [qh0, qh1, qh2] = queryFingerprint; + // Approximate set intersection with bloom filters. + // This can be larger than reality, not smaller, because hashes have + // the property that if they've got the same value, they hash to the + // same thing. False positives exist, but not false negatives. + const [in0, in1, in2] = [fh0 & qh0, fh1 & qh1, fh2 & qh2]; + // Approximate the set of items in the query but not the function. + // This might be smaller than reality, but cannot be bigger. + // + // | in_ | qh_ | XOR | Meaning | + // | --- | --- | --- | ------------------------------------------------ | + // | 0 | 0 | 0 | Not present | + // | 1 | 0 | 1 | IMPOSSIBLE because `in_` is `fh_ & qh_` | + // | 1 | 1 | 0 | If one or both is false positive, false negative | + // | 0 | 1 | 1 | Since in_ has no false negatives, must be real | + if ((in0 ^ qh0) || (in1 ^ qh1) || (in2 ^ qh2)) { + return null; + } + return functionTypeFingerprint[(fullId * 4) + 3]; + } + function buildIndex(rawSearchIndex) { searchIndex = []; /** @@ -2802,6 +2912,22 @@ ${item.displayPath}${name}\ typeNameIdOfSlice = buildTypeMapIndex("slice"); typeNameIdOfArrayOrSlice = buildTypeMapIndex("[]"); + // Function type fingerprints are 128-bit bloom filters that are used to + // estimate the distance between function and query. + // This loop counts the number of items to allocate a fingerprint for. 
+ for (const crate in rawSearchIndex) { + if (!hasOwnPropertyRustdoc(rawSearchIndex, crate)) { + continue; + } + // Each item gets an entry in the fingerprint array, and the crate + // does, too + id += rawSearchIndex[crate].t.length + 1; + } + functionTypeFingerprint = new Uint32Array((id + 1) * 4); + + // This loop actually generates the search item indexes, including + // normalized names, type signature objects and fingerprints, and aliases. + id = 0; for (const crate in rawSearchIndex) { if (!hasOwnPropertyRustdoc(rawSearchIndex, crate)) { continue; @@ -2951,6 +3077,28 @@ ${item.displayPath}${name}\ } searchWords.push(word); const path = itemPaths.has(i) ? itemPaths.get(i) : lastPath; + let type = null; + if (itemFunctionSearchTypes[i] !== 0) { + type = buildFunctionSearchType( + itemFunctionSearchTypes[i], + lowercasePaths + ); + if (type) { + const fp = functionTypeFingerprint.subarray(id * 4, (id + 1) * 4); + const fps = new Set(); + for (const t of type.inputs) { + buildFunctionTypeFingerprint(t, fp, fps); + } + for (const t of type.output) { + buildFunctionTypeFingerprint(t, fp, fps); + } + for (const w of type.where_clause) { + for (const t of w) { + buildFunctionTypeFingerprint(t, fp, fps); + } + } + } + } const row = { crate: crate, ty: itemTypes.charCodeAt(i) - charA, @@ -2958,10 +3106,7 @@ ${item.displayPath}${name}\ path: path, desc: itemDescs[i], parent: itemParentIdxs[i] > 0 ? paths[itemParentIdxs[i] - 1] : undefined, - type: buildFunctionSearchType( - itemFunctionSearchTypes[i], - lowercasePaths - ), + type, id: id, normalizedName: word.indexOf("_") === -1 ? 
word : word.replace(/_/g, ""), deprecated: deprecatedItems.has(i), diff --git a/tests/rustdoc-js/assoc-type.js b/tests/rustdoc-js/assoc-type.js index 47776656e32c2..eec4e7a8258fb 100644 --- a/tests/rustdoc-js/assoc-type.js +++ b/tests/rustdoc-js/assoc-type.js @@ -7,16 +7,16 @@ const EXPECTED = [ 'query': 'iterator -> u32', 'correction': null, 'others': [ - { 'path': 'assoc_type', 'name': 'my_fn' }, { 'path': 'assoc_type::my', 'name': 'other_fn' }, + { 'path': 'assoc_type', 'name': 'my_fn' }, ], }, { 'query': 'iterator', 'correction': null, 'in_args': [ - { 'path': 'assoc_type', 'name': 'my_fn' }, { 'path': 'assoc_type::my', 'name': 'other_fn' }, + { 'path': 'assoc_type', 'name': 'my_fn' }, ], }, { @@ -26,8 +26,8 @@ const EXPECTED = [ { 'path': 'assoc_type', 'name': 'Something' }, ], 'in_args': [ - { 'path': 'assoc_type', 'name': 'my_fn' }, { 'path': 'assoc_type::my', 'name': 'other_fn' }, + { 'path': 'assoc_type', 'name': 'my_fn' }, ], }, // if I write an explicit binding, only it shows up diff --git a/tests/rustdoc-js/big-result.js b/tests/rustdoc-js/big-result.js new file mode 100644 index 0000000000000..07961d196f47d --- /dev/null +++ b/tests/rustdoc-js/big-result.js @@ -0,0 +1,39 @@ +// exact-check + +const EXPECTED = [ + { + 'query': 'First', + 'in_args': (function() { + // Generate the list of 200 items that should match. + const results = []; + function generate(lx, ly) { + for (const x of lx) { + for (const y of ly) { + results.push({ + 'path': `big_result::${y}`, + 'name': x, + }); + } + } + } + // Fewest parameters that still match go on top. 
+ generate( + ['u', 'v', 'w', 'x', 'y'], + ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + ); + generate( + ['p', 'q', 'r', 's', 't'], + ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + ); + generate( + ['k', 'l', 'm', 'n', 'o'], + ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + ); + generate( + ['f', 'g', 'h', 'i', 'j'], + ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] + ); + return results; + })(), + }, +]; diff --git a/tests/rustdoc-js/big-result.rs b/tests/rustdoc-js/big-result.rs new file mode 100644 index 0000000000000..4dfecd6aaadd5 --- /dev/null +++ b/tests/rustdoc-js/big-result.rs @@ -0,0 +1,61 @@ +#![feature(concat_idents)] +#![allow(nonstandard_style)] +/// Generate 250 items that all match the query, starting with the longest. +/// Those long items should be dropped from the result set, and the short ones +/// should be shown instead. +macro_rules! generate { + ([$($x:ident),+], $y:tt, $z:tt) => { + $( + generate!(@ $x, $y, $z); + )+ + }; + (@ $x:ident , [$($y:ident),+], $z:tt) => { + pub struct $x; + $( + generate!(@@ $x, $y, $z); + )+ + }; + (@@ $x:ident , $y:ident, [$($z:ident: $zt:ident),+]) => { + impl $y { + pub fn $x($($z: $zt,)+) {} + } + } +} + +pub struct First; +pub struct Second; +pub struct Third; +pub struct Fourth; +pub struct Fifth; + +generate!( + [a, b, c, d, e], + [a, b, c, d, e, f, g, h, i, j], + [a: First, b: Second, c: Third, d: Fourth, e: Fifth] +); + +generate!( + [f, g, h, i, j], + [a, b, c, d, e, f, g, h, i, j], + [a: First, b: Second, c: Third, d: Fourth] +); + +generate!( + [k, l, m, n, o], + [a, b, c, d, e, f, g, h, i, j], + [a: First, b: Second, c: Third] +); + +generate!( + // reverse it, just to make sure they're alphabetized + // in the result set when all else is equal + [t, s, r, q, p], + [a, b, c, d, e, f, g, h, i, j], + [a: First, b: Second] +); + +generate!( + [u, v, w, x, y], + [a, b, c, d, e, f, g, h, i, j], + [a: First] +); diff --git a/tests/rustdoc-js/full-path-function.js 
b/tests/rustdoc-js/full-path-function.js index 48be51b156fde..0464f7922174d 100644 --- a/tests/rustdoc-js/full-path-function.js +++ b/tests/rustdoc-js/full-path-function.js @@ -4,16 +4,16 @@ const EXPECTED = [ { 'query': 'sac -> usize', 'others': [ - { 'path': 'full_path_function::b::Sac', 'name': 'bar' }, { 'path': 'full_path_function::b::Sac', 'name': 'len' }, { 'path': 'full_path_function::sac::Sac', 'name': 'len' }, + { 'path': 'full_path_function::b::Sac', 'name': 'bar' }, ], }, { 'query': 'b::sac -> usize', 'others': [ - { 'path': 'full_path_function::b::Sac', 'name': 'bar' }, { 'path': 'full_path_function::b::Sac', 'name': 'len' }, + { 'path': 'full_path_function::b::Sac', 'name': 'bar' }, ], }, { diff --git a/tests/rustdoc-js/generics.js b/tests/rustdoc-js/generics.js index ebc92ccfc0575..b3ca0af3056a5 100644 --- a/tests/rustdoc-js/generics.js +++ b/tests/rustdoc-js/generics.js @@ -1,4 +1,5 @@ // exact-check +// ignore-order const EXPECTED = [ { diff --git a/tests/rustdoc-js/impl-trait.js b/tests/rustdoc-js/impl-trait.js index 00d67d639bd08..8bb3f2d3e99a5 100644 --- a/tests/rustdoc-js/impl-trait.js +++ b/tests/rustdoc-js/impl-trait.js @@ -39,8 +39,8 @@ const EXPECTED = [ { 'path': 'impl_trait', 'name': 'Aaaaaaa' }, ], 'in_args': [ - { 'path': 'impl_trait::Ccccccc', 'name': 'eeeeeee' }, { 'path': 'impl_trait::Ccccccc', 'name': 'fffffff' }, + { 'path': 'impl_trait::Ccccccc', 'name': 'eeeeeee' }, ], 'returned': [ { 'path': 'impl_trait', 'name': 'bbbbbbb' }, diff --git a/tests/rustdoc-js/type-parameters.js b/tests/rustdoc-js/type-parameters.js index e695f189bb672..e045409e507e5 100644 --- a/tests/rustdoc-js/type-parameters.js +++ b/tests/rustdoc-js/type-parameters.js @@ -1,20 +1,19 @@ // exact-check -// ignore-order const EXPECTED = [ { query: '-> trait:Some', others: [ - { path: 'foo', name: 'alef' }, { path: 'foo', name: 'alpha' }, + { path: 'foo', name: 'alef' }, ], }, { query: '-> generic:T', others: [ + { path: 'foo', name: 'beta' }, { path: 'foo', name: 
'bet' }, { path: 'foo', name: 'alef' }, - { path: 'foo', name: 'beta' }, ], }, { @@ -44,38 +43,40 @@ const EXPECTED = [ { query: 'Other, Other', others: [ - { path: 'foo', name: 'other' }, { path: 'foo', name: 'alternate' }, + { path: 'foo', name: 'other' }, ], }, { query: 'generic:T', in_args: [ - { path: 'foo', name: 'bet' }, { path: 'foo', name: 'beta' }, - { path: 'foo', name: 'other' }, + { path: 'foo', name: 'bet' }, { path: 'foo', name: 'alternate' }, + { path: 'foo', name: 'other' }, ], }, { query: 'generic:Other', in_args: [ - { path: 'foo', name: 'bet' }, { path: 'foo', name: 'beta' }, - { path: 'foo', name: 'other' }, + { path: 'foo', name: 'bet' }, { path: 'foo', name: 'alternate' }, + { path: 'foo', name: 'other' }, ], }, { query: 'trait:Other', in_args: [ - { path: 'foo', name: 'other' }, { path: 'foo', name: 'alternate' }, + { path: 'foo', name: 'other' }, ], }, { query: 'Other', in_args: [ + // because function is called "other", it's sorted first + // even though it has higher type distance { path: 'foo', name: 'other' }, { path: 'foo', name: 'alternate' }, ], From 9dfcf131b3a8c36c5d9f8c46b3f127e5709b2b93 Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Sun, 10 Dec 2023 22:46:40 -0700 Subject: [PATCH 3/4] rustdoc-search: better hashing, faster unification The hash changes are based on some tests with `arti` and various specific queries, aimed at reducing the false positive rate. Sorting the query elements so that generics always come first is instead aimed at reducing the number of Map operations on mgens, assuming if the bloom filter does find a false positive, it'll be able to reject the row without having to track a mapping. - https://hur.st/bloomfilter/?n=3&p=&m=96&k=6 Different functions have different amounts of inputs, and unification isn't very slow anyway, so figuring out a single ideal number of hash functions is nasty, but 6 keeps things low even up to 10 inputs. 
- https://web.archive.org/web/20210927123933/https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.72.2442&rep=rep1&type=pdf This is the `h1` and `h2`, both derived from `h0`. --- src/librustdoc/html/static/js/search.js | 56 ++++++++++++++++++++----- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/src/librustdoc/html/static/js/search.js b/src/librustdoc/html/static/js/search.js index dc47e093ecd75..a72190661043b 100644 --- a/src/librustdoc/html/static/js/search.js +++ b/src/librustdoc/html/static/js/search.js @@ -2261,6 +2261,22 @@ function initSearch(rawSearchIndex) { } } } else if (parsedQuery.foundElems > 0) { + // Sort input and output so that generic type variables go first and + // types with generic parameters go last. + // That's because of the way unification is structured: it eats off + // the end, and hits a fast path if the last item is a simple atom. + const sortQ = (a, b) => { + const ag = a.generics.length === 0 && a.bindings.size === 0; + const bg = b.generics.length === 0 && b.bindings.size === 0; + if (ag !== bg) { + return ag - bg; + } + const ai = a.id > 0; + const bi = b.id > 0; + return ai - bi; + }; + parsedQuery.elems.sort(sortQ); + parsedQuery.returned.sort(sortQ); for (let i = 0, nSearchWords = searchWords.length; i < nSearchWords; ++i) { handleArgs(searchIndex[i], i, results_others); } @@ -2823,7 +2839,6 @@ ${item.displayPath}${name}\ * @param {Set} fps - Set of distinct items */ function buildFunctionTypeFingerprint(type, output, fps) { - let input = type.id; // All forms of `[]` get collapsed down to one thing in the bloom filter. // Differentiating between arrays and slices, if the user asks for it, is @@ -2831,15 +2846,37 @@ ${item.displayPath}${name}\ if (input === typeNameIdOfArray || input === typeNameIdOfSlice) { input = typeNameIdOfArrayOrSlice; } + // http://burtleburtle.net/bob/hash/integer.html + // ~~ is toInt32. It's used before adding, so + // the number stays in safe integer range. 
+ const hashint1 = k => { + k = (~~k + 0x7ed55d16) + (k << 12); + k = (k ^ 0xc761c23c) ^ (k >>> 19); + k = (~~k + 0x165667b1) + (k << 5); + k = (~~k + 0xd3a2646c) ^ (k << 9); + k = (~~k + 0xfd7046c5) + (k << 3); + return (k ^ 0xb55a4f09) ^ (k >>> 16); + }; + const hashint2 = k => { + k = ~k + (k << 15); + k ^= k >>> 12; + k += k << 2; + k ^= k >>> 4; + k = Math.imul(k, 2057); + return k ^ (k >> 16); + }; if (input !== null) { - // https://docs.rs/rustc-hash/1.1.0/src/rustc_hash/lib.rs.html#60 - // Rotate is skipped because we're only doing one cycle anyway. - const h0 = Math.imul(input, 0x9e3779b9); - const h1 = Math.imul(479001599 ^ input, 0x9e3779b9); - const h2 = Math.imul(433494437 ^ input, 0x9e3779b9); - output[0] |= 1 << (h0 % 32); - output[1] |= 1 << (h1 % 32); - output[2] |= 1 << (h2 % 32); + const h0a = hashint1(input); + const h0b = hashint2(input); + // Less Hashing, Same Performance: Building a Better Bloom Filter + // doi=10.1.1.72.2442 + const h1a = ~~(h0a + Math.imul(h0b, 2)); + const h1b = ~~(h0a + Math.imul(h0b, 3)); + const h2a = ~~(h0a + Math.imul(h0b, 4)); + const h2b = ~~(h0a + Math.imul(h0b, 5)); + output[0] |= (1 << (h0a % 32)) | (1 << (h1b % 32)); + output[1] |= (1 << (h1a % 32)) | (1 << (h2b % 32)); + output[2] |= (1 << (h2a % 32)) | (1 << (h0b % 32)); fps.add(input); } for (const g of type.generics) { @@ -2868,7 +2905,6 @@ ${item.displayPath}${name}\ * This function might return 0! 
*/ function compareTypeFingerprints(fullId, queryFingerprint) { - const fh0 = functionTypeFingerprint[fullId * 4]; const fh1 = functionTypeFingerprint[(fullId * 4) + 1]; const fh2 = functionTypeFingerprint[(fullId * 4) + 2]; From bec66729849a1a62505170054df7b107344220cd Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Tue, 12 Dec 2023 12:26:00 -0700 Subject: [PATCH 4/4] rustdoc-search: clean up handleSingleArg type handling --- src/librustdoc/html/static/js/search.js | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/librustdoc/html/static/js/search.js b/src/librustdoc/html/static/js/search.js index a72190661043b..6fce7650b4c14 100644 --- a/src/librustdoc/html/static/js/search.js +++ b/src/librustdoc/html/static/js/search.js @@ -2000,11 +2000,11 @@ function initSearch(rawSearchIndex) { fullId, parsedQuery.typeFingerprint ); - if (tfpDist !== null && - !(results_in_args.size >= MAX_RESULTS && tfpDist > results_in_args.max_dist) - ) { + if (tfpDist !== null) { const in_args = row.type && row.type.inputs && checkIfInList(row.type.inputs, elem, row.type.where_clause); + const returned = row.type && row.type.output + && checkIfInList(row.type.output, elem, row.type.where_clause); if (in_args) { results_in_args.max_dist = Math.max(results_in_args.max_dist || 0, tfpDist); const maxDist = results_in_args.size < MAX_RESULTS ? @@ -2012,12 +2012,6 @@ function initSearch(rawSearchIndex) { results_in_args.max_dist; addIntoResults(results_in_args, fullId, pos, -1, tfpDist, 0, maxDist); } - } - if (tfpDist !== false && - !(results_returned.size >= MAX_RESULTS && tfpDist > results_returned.max_dist) - ) { - const returned = row.type && row.type.output - && checkIfInList(row.type.output, elem, row.type.where_clause); if (returned) { results_returned.max_dist = Math.max(results_returned.max_dist || 0, tfpDist); const maxDist = results_returned.size < MAX_RESULTS ?