feat: filepath filter for code search (#6143)

Added support for searching content in a specific directory or file.
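For illustration, a minimal sketch of how the new filter reaches the indexer's Search method shown in the diff below. The Filename field is what this commit adds; the other option fields and the return shape are paraphrased from the surrounding code and may differ slightly:

	total, hits, langs, err := indexer.Search(ctx, &internal.SearchOptions{
		RepoIDs:  []int64{repo.ID},
		Keyword:  "TokenStream",
		Filename: "modules/indexer/code", // restrict matches to this directory
	})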

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6143
Reviewed-by: Gusted <gusted@noreply.codeberg.org>
Reviewed-by: 0ko <0ko@noreply.codeberg.org>
Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
Shiny Nematoda authored on 2024-12-22 12:24:29 +00:00; committed by 0ko
parent bb88e1daf8
commit ee214cb886
19 changed files with 342 additions and 61 deletions


@@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
tokenizer_hierarchy "code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/hierarchy"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@@ -56,6 +57,7 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
type RepoIndexerData struct {
RepoID int64
CommitID string
Filename string
Content string
Language string
UpdatedAt time.Time
@@ -69,7 +71,8 @@ func (d *RepoIndexerData) Type() string {
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 6
pathHierarchyAnalyzer = "pathHierarchyAnalyzer"
repoIndexerLatestVersion = 7
)
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
@@ -89,6 +92,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
pathFieldMapping := bleve.NewTextFieldMapping()
pathFieldMapping.IncludeInAll = false
pathFieldMapping.Analyzer = pathHierarchyAnalyzer
docMapping.AddFieldMappingsAt("Filename", pathFieldMapping)
timeFieldMapping := bleve.NewDateTimeFieldMapping()
timeFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
@@ -103,6 +111,13 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
}); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(pathHierarchyAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": tokenizer_hierarchy.Name,
"token_filters": []string{unicodeNormalizeName},
}); err != nil {
return nil, err
}
mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
@@ -178,6 +193,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
@@ -266,22 +282,30 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
indexerQuery = keywordQuery
}
opts.Filename = strings.Trim(opts.Filename, "/")
if len(opts.Filename) > 0 {
// we use a keyword analyzer for the query rather than the path hierarchy
// analyzer so that only the exact path matches;
// e.g. a query for modules/indexer/code
// should not return results for modules/ or modules/indexer
indexerQuery = bleve.NewConjunctionQuery(
indexerQuery,
inner_bleve.MatchQuery(opts.Filename, "Filename", analyzer_keyword.Name, 0),
)
}
// Save for reuse without language filter
facetQuery := indexerQuery
if len(opts.Language) > 0 {
languageQuery := bleve.NewMatchQuery(opts.Language)
languageQuery.FieldVal = "Language"
languageQuery.Analyzer = analyzer_keyword.Name
indexerQuery = bleve.NewConjunctionQuery(
indexerQuery,
languageQuery,
inner_bleve.MatchQuery(opts.Language, "Language", analyzer_keyword.Name, 0),
)
}
from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.Fields = []string{"Content", "RepoID", "Filename", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
if len(opts.Language) == 0 {
@@ -320,7 +344,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: internal.FilenameOfIndexerID(hit.ID),
Filename: hit.Fields["Filename"].(string),
Content: hit.Fields["Content"].(string),
CommitID: hit.Fields["CommitID"].(string),
UpdatedUnix: updatedUnix,
@@ -333,7 +357,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
if len(opts.Language) > 0 {
// Use separate query to go get all language counts
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
facetRequest.Fields = []string{"Content", "RepoID", "Filename", "Language", "CommitID", "UpdatedAt"}
facetRequest.IncludeLocations = true
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
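The asymmetry noted in the comment above is the heart of the change: paths are indexed with the path hierarchy analyzer (one token per ancestor directory) but queried with the keyword analyzer (the whole filter as a single token), so a filter matches a directory and everything beneath it, exactly. A self-contained sketch of that behavior against an in-memory bleve index; the mapping below is simplified and the identifiers are illustrative, not the ones used by the indexer:

	package main

	import (
		"fmt"
		"log"

		"github.com/blevesearch/bleve/v2"
		analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
		analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"

		// importing the package runs its init(), registering "path_hierarchy"
		tokenizer_hierarchy "code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/hierarchy"
	)

	func main() {
		m := bleve.NewIndexMapping()
		if err := m.AddCustomAnalyzer("pathHierarchy", map[string]any{
			"type":      analyzer_custom.Name,
			"tokenizer": tokenizer_hierarchy.Name,
		}); err != nil {
			log.Fatal(err)
		}
		fileMapping := bleve.NewTextFieldMapping()
		fileMapping.Analyzer = "pathHierarchy"
		doc := bleve.NewDocumentMapping()
		doc.AddFieldMappingsAt("Filename", fileMapping)
		m.DefaultMapping = doc

		idx, err := bleve.NewMemOnly(m)
		if err != nil {
			log.Fatal(err)
		}
		_ = idx.Index("a", map[string]any{"Filename": "modules/indexer/code/search.go"})
		_ = idx.Index("b", map[string]any{"Filename": "modules/git/blob.go"})

		// keyword analyzer: the filter stays one exact token
		q := bleve.NewMatchQuery("modules/indexer/code")
		q.SetField("Filename")
		q.Analyzer = analyzer_keyword.Name

		res, err := idx.Search(bleve.NewSearchRequest(q))
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(res.Total) // 1: only search.go; nothing under modules/git
	}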


@@ -0,0 +1,69 @@
// Copyright 2024 The Forgejo Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package hierarchy
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "path_hierarchy"
type PathHierarchyTokenizer struct{}
// Similar to Elastic's path_hierarchy tokenizer.
// This tokenizes a given path into all the possible hierarchies.
// For example,
// modules/indexer/code/search.go =>
//
// modules
// modules/indexer
// modules/indexer/code
// modules/indexer/code/search.go
func (t *PathHierarchyTokenizer) Tokenize(input []byte) analysis.TokenStream {
// trim any leading/trailing slashes
input = bytes.Trim(input, "/")
// no further allocations until the directory depth exceeds 8 (which is unlikely)
rv := make(analysis.TokenStream, 0, 8)
count, off := 1, 0
// iterate over every directory separator
for i := bytes.IndexRune(input[off:], '/'); i != -1; i = bytes.IndexRune(input[off:], '/') {
// the index is relative to input[off:];
// add it to the accumulated offset to get the index of the current separator within input
off += i
rv = append(rv, &analysis.Token{
Term: input[:off], // the path prefix up to (but excluding) the current separator
Start: 0,
End: off,
Position: count,
Type: analysis.AlphaNumeric,
})
// advance the offset past the separator
off++
count++
}
// the entire file path should always be the last token
rv = append(rv, &analysis.Token{
Term: input,
Start: 0,
End: len(input),
Position: count,
Type: analysis.AlphaNumeric,
})
return rv
}
func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
return &PathHierarchyTokenizer{}, nil
}
func init() {
registry.RegisterTokenizer(Name, TokenizerConstructor)
}
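Standalone, the tokenizer behaves as the test file below verifies. A quick, hypothetical way to see the emitted terms, assuming the package is importable at the path shown in the diff above:

	package main

	import (
		"fmt"

		"code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/hierarchy"
	)

	func main() {
		t := &hierarchy.PathHierarchyTokenizer{}
		for _, tok := range t.Tokenize([]byte("modules/indexer/code/search.go")) {
			fmt.Printf("%d: %s (bytes %d-%d)\n", tok.Position, tok.Term, tok.Start, tok.End)
		}
		// Output:
		// 1: modules (bytes 0-7)
		// 2: modules/indexer (bytes 0-15)
		// 3: modules/indexer/code (bytes 0-20)
		// 4: modules/indexer/code/search.go (bytes 0-30)
	}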


@@ -0,0 +1,59 @@
// Copyright 2024 The Forgejo Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package hierarchy
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestIndexerBleveHierarchyTokenizer(t *testing.T) {
tokenizer := &PathHierarchyTokenizer{}
keywords := []struct {
Term string
Results []string
}{
{
Term: "modules/indexer/code/search.go",
Results: []string{
"modules",
"modules/indexer",
"modules/indexer/code",
"modules/indexer/code/search.go",
},
},
{
Term: "/tmp/forgejo/",
Results: []string{
"tmp",
"tmp/forgejo",
},
},
{
Term: "a/b/c/d/e/f/g/h/i/j",
Results: []string{
"a",
"a/b",
"a/b/c",
"a/b/c/d",
"a/b/c/d/e",
"a/b/c/d/e/f",
"a/b/c/d/e/f/g",
"a/b/c/d/e/f/g/h",
"a/b/c/d/e/f/g/h/i",
"a/b/c/d/e/f/g/h/i/j",
},
},
}
for _, kw := range keywords {
tokens := tokenizer.Tokenize([]byte(kw.Term))
assert.Len(t, tokens, len(kw.Results))
for i, token := range tokens {
assert.Equal(t, i+1, token.Position)
assert.Equal(t, kw.Results[i], string(token.Term))
}
}
}