feat(code search): replace fuzzy search with union search for indexer (#6947)
Some checks are pending
/ release (push) Waiting to run
testing / backend-checks (push) Has been skipped
testing / frontend-checks (push) Has been skipped
testing / test-remote-cacher (redis) (push) Has been skipped
testing / test-remote-cacher (valkey) (push) Has been skipped
testing / test-remote-cacher (garnet) (push) Has been skipped
testing / test-remote-cacher (redict) (push) Has been skipped
testing / test-unit (push) Has been skipped
testing / test-e2e (push) Has been skipped
testing / test-mysql (push) Has been skipped
testing / test-pgsql (push) Has been skipped
testing / test-sqlite (push) Has been skipped
testing / security-check (push) Has been skipped

Fuzzy searching for code has been known to be problematic #5264 and in my personal opinion isn't very useful.

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6947
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
This commit is contained in:
Shiny Nematoda 2025-03-11 21:22:51 +00:00 committed by Gusted
parent cb46a036aa
commit 3816db68aa
10 changed files with 105 additions and 86 deletions

View file

@ -40,10 +40,6 @@ import (
const (
unicodeNormalizeName = "unicodeNormalize"
maxBatchSize = 16
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
fuzzyDenominator = 4
// see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311
maxFuzziness = 2
)
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
@ -260,12 +256,14 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
keywordQuery query.Query
)
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
if opts.IsKeywordFuzzy {
phraseQuery.Fuzziness = min(maxFuzziness, len(opts.Keyword)/fuzzyDenominator)
if opts.Mode == internal.CodeSearchModeUnion {
query := bleve.NewDisjunctionQuery()
for _, field := range strings.Fields(opts.Keyword) {
query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, 0))
}
keywordQuery = query
} else {
keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, 0)
}
if len(opts.RepoIDs) > 0 {
@ -325,13 +323,16 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
for i, hit := range result.Hits {
startIndex, endIndex := -1, -1
for _, locations := range hit.Locations["Content"] {
if startIndex != -1 && endIndex != -1 {
break
}
location := locations[0]
locationStart := int(location.Start)
locationEnd := int(location.End)
if startIndex < 0 || locationStart < startIndex {
startIndex = locationStart
}
if endIndex < 0 || locationEnd > endIndex {
if endIndex < 0 && locationEnd > endIndex {
endIndex = locationEnd
}
}