mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2025-05-16 23:12:43 +00:00
feat: filepath filter for code search (#6143)
Added support for searching content in a specific directory or file. Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6143 Reviewed-by: Gusted <gusted@noreply.codeberg.org> Reviewed-by: 0ko <0ko@noreply.codeberg.org> Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com> Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
This commit is contained in:
parent
bb88e1daf8
commit
ee214cb886
19 changed files with 342 additions and 61 deletions
69
modules/indexer/code/bleve/tokenizer/hierarchy/hierarchy.go
Normal file
69
modules/indexer/code/bleve/tokenizer/hierarchy/hierarchy.go
Normal file
|
@ -0,0 +1,69 @@
|
|||
// Copyright 2024 The Forgejo Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package hierarchy
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the name under which this tokenizer is registered in the bleve registry.
const Name = "path_hierarchy"
|
||||
|
||||
// PathHierarchyTokenizer is a stateless bleve tokenizer that expands a
// slash-separated path into one token per hierarchy level; see Tokenize.
type PathHierarchyTokenizer struct{}
|
||||
|
||||
// Similar to elastic's path_hierarchy tokenizer
|
||||
// This tokenizes a given path into all the possible hierarchies
|
||||
// For example,
|
||||
// modules/indexer/code/search.go =>
|
||||
//
|
||||
// modules/
|
||||
// modules/indexer
|
||||
// modules/indexer/code
|
||||
// modules/indexer/code/search.go
|
||||
func (t *PathHierarchyTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
// trim any extra slashes
|
||||
input = bytes.Trim(input, "/")
|
||||
|
||||
// zero allocations until the nested directories exceed a depth of 8 (which is unlikely)
|
||||
rv := make(analysis.TokenStream, 0, 8)
|
||||
count, off := 1, 0
|
||||
|
||||
// iterate till all directory seperators
|
||||
for i := bytes.IndexRune(input[off:], '/'); i != -1; i = bytes.IndexRune(input[off:], '/') {
|
||||
// the index is relative to input[offest...]
|
||||
// add this index to the accumlated offset to get the index of the current seperator in input[0...]
|
||||
off += i
|
||||
rv = append(rv, &analysis.Token{
|
||||
Term: input[:off], // take the slice, input[0...index of seperator]
|
||||
Start: 0,
|
||||
End: off,
|
||||
Position: count,
|
||||
Type: analysis.AlphaNumeric,
|
||||
})
|
||||
// increment the offset after considering the seperator
|
||||
off++
|
||||
count++
|
||||
}
|
||||
|
||||
// the entire file path should always be the last token
|
||||
rv = append(rv, &analysis.Token{
|
||||
Term: input,
|
||||
Start: 0,
|
||||
End: len(input),
|
||||
Position: count,
|
||||
Type: analysis.AlphaNumeric,
|
||||
})
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
return &PathHierarchyTokenizer{}, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenizer(Name, TokenizerConstructor)
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
// Copyright 2024 The Forgejo Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package hierarchy
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestIndexerBleveHierarchyTokenizer(t *testing.T) {
|
||||
tokenizer := &PathHierarchyTokenizer{}
|
||||
keywords := []struct {
|
||||
Term string
|
||||
Results []string
|
||||
}{
|
||||
{
|
||||
Term: "modules/indexer/code/search.go",
|
||||
Results: []string{
|
||||
"modules",
|
||||
"modules/indexer",
|
||||
"modules/indexer/code",
|
||||
"modules/indexer/code/search.go",
|
||||
},
|
||||
},
|
||||
{
|
||||
Term: "/tmp/forgejo/",
|
||||
Results: []string{
|
||||
"tmp",
|
||||
"tmp/forgejo",
|
||||
},
|
||||
},
|
||||
{
|
||||
Term: "a/b/c/d/e/f/g/h/i/j",
|
||||
Results: []string{
|
||||
"a",
|
||||
"a/b",
|
||||
"a/b/c",
|
||||
"a/b/c/d",
|
||||
"a/b/c/d/e",
|
||||
"a/b/c/d/e/f",
|
||||
"a/b/c/d/e/f/g",
|
||||
"a/b/c/d/e/f/g/h",
|
||||
"a/b/c/d/e/f/g/h/i",
|
||||
"a/b/c/d/e/f/g/h/i/j",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, kw := range keywords {
|
||||
tokens := tokenizer.Tokenize([]byte(kw.Term))
|
||||
assert.Len(t, tokens, len(kw.Results))
|
||||
for i, token := range tokens {
|
||||
assert.Equal(t, i+1, token.Position)
|
||||
assert.Equal(t, kw.Results[i], string(token.Term))
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue