forgejo/modules/indexer/code/bleve/tokenizer/hierarchy/hierarchy.go

72 lines
1.9 KiB
Go
Raw Normal View History

// Copyright 2024 The Forgejo Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package hierarchy
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "path_hierarchy"
type PathHierarchyTokenizer struct{}
// Similar to elastic's path_hierarchy tokenizer
// This tokenizes a given path into all the possible hierarchies
// For example,
// modules/indexer/code/search.go =>
//
// modules/
// modules/indexer
// modules/indexer/code
// modules/indexer/code/search.go
func (t *PathHierarchyTokenizer) Tokenize(input []byte) analysis.TokenStream {
// trim any extra slashes
input = bytes.Trim(input, "/")
// zero allocations until the nested directories exceed a depth of 8 (which is unlikely)
rv := make(analysis.TokenStream, 0, 8)
count, off := 1, 0
// iterate till all directory separators
for i := bytes.IndexRune(input[off:], '/'); i != -1; i = bytes.IndexRune(input[off:], '/') {
// the index is relative to input[offset...]
// add this index to the accumulated offset to get the index of the current separator in input[0...]
off += i
rv = append(rv, &analysis.Token{
Term: input[:off], // take the slice, input[0...index of separator]
Start: 0,
End: off,
Position: count,
Type: analysis.AlphaNumeric,
})
// increment the offset after considering the separator
off++
count++
}
// the entire file path should always be the last token
rv = append(rv, &analysis.Token{
Term: input,
Start: 0,
End: len(input),
Position: count,
Type: analysis.AlphaNumeric,
})
return rv
}
func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
return &PathHierarchyTokenizer{}, nil
}
func init() {
Update module github.com/blevesearch/bleve/v2 to v2.5.0 (forgejo) (#7468) This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [github.com/blevesearch/bleve/v2](https://github.com/blevesearch/bleve) | require | minor | `v2.4.4` -> `v2.5.0` | --- ### Release Notes <details> <summary>blevesearch/bleve (github.com/blevesearch/bleve/v2)</summary> ### [`v2.5.0`](https://github.com/blevesearch/bleve/releases/tag/v2.5.0) [Compare Source](https://github.com/blevesearch/bleve/compare/v2.4.4...v2.5.0) ##### Bug Fixes - Exact hits to score higher than fuzzy hits, with https://github.com/blevesearch/bleve/pull/2056 - Fix boosting during hybrid search that involves text + nearest neighbor, with https://github.com/blevesearch/bleve/pull/2127 - Addressed bug in IP field handling while highlighting, with https://github.com/blevesearch/bleve/pull/2142 - Graceful error handling within registry, with https://github.com/blevesearch/bleve/pull/2151 - `http/` package (meant for demo purposes) removed from repository to remove vulnerability - [CVE-2022-31022](https://github.com/blevesearch/bleve/security/advisories/GHSA-9w9f-6mg8-jp7w), relocated to within https://github.com/blevesearch/bleve-explorer - Geo radius queries will now advertise distances (within sort values) in readable format, with https://github.com/blevesearch/bleve/pull/2137 ##### Improvements - Vector search requires `faiss` dynamic library to be built from [blevesearch/faiss@352484e](https://github.com/blevesearch/faiss/tree/352484e0fc9d1f8f46737841efe5f26e0f383f71) which is a modified version of [v1.10.0](https://github.com/facebookresearch/faiss/releases/tag/v1.10.0) - Support for **BM25 scoring**, see: [scoring.md](https://github.com/blevesearch/bleve/blob/v2.5.0/docs/scoring.md#bm25) - Support for **synonyms' search**, see: [synonyms.md](https://github.com/blevesearch/bleve/blob/v2.5.0/docs/synonyms.md) - **Significant performance improvements in pre-filtered vector search**, with https://github.com/blevesearch/bleve/pull/2169 + dependent changes - `auto` fuzziness detection with https://github.com/blevesearch/bleve/pull/2060 - Ability to affect ingestion/drain rate by tuning persister workers with https://github.com/blevesearch/bleve/pull/2100 - Additional config in merge policy for improved merger behavior, with https://github.com/blevesearch/bleve/pull/2134 - Geo improvements: footprint reduction for polygons, better validation and graceful error handling, with https://github.com/blevesearch/bleve/pull/2162 + https://github.com/blevesearch/bleve/pull/2158 + https://github.com/blevesearch/bleve/pull/2165 - Upgrade to RoaringBitmap/roaring@v2.4.5, etcd.io/bbolt@v1.4.0 - More metrics ##### Milestone - [v2.5.0](https://github.com/blevesearch/bleve/milestone/24) </details> --- ### Configuration 📅 **Schedule**: Branch creation - "* 0-3 * * *" (UTC), Automerge - "* 0-3 * * *" (UTC). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] <!-- rebase-check -->If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://github.com/renovatebot/renovate). <!--renovate-debug:eyJjcmVhdGVkSW5WZXIiOiIzOS4yMjIuMSIsInVwZGF0ZWRJblZlciI6IjM5LjIyMi4xIiwidGFyZ2V0QnJhbmNoIjoiZm9yZ2VqbyIsImxhYmVscyI6WyJkZXBlbmRlbmN5LXVwZ3JhZGUiLCJ0ZXN0L25vdC1uZWVkZWQiXX0=--> Co-authored-by: Gusted <postmaster@gusted.xyz> Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/7468 Reviewed-by: Gusted <gusted@noreply.codeberg.org> Reviewed-by: Shiny Nematoda <snematoda@noreply.codeberg.org> Co-authored-by: Renovate Bot <forgejo-renovate-action@forgejo.org> Co-committed-by: Renovate Bot <forgejo-renovate-action@forgejo.org>
2025-04-06 08:41:38 +00:00
if err := registry.RegisterTokenizer(Name, TokenizerConstructor); err != nil {
panic(err)
}
}