feat: filepath filter for code search (#6143)

Added support for searching content in a specific directory or file. Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6143 Reviewed-by: Gusted <gusted@noreply.codeberg.org> Reviewed-by: 0ko <0ko@noreply.codeberg.org> Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com> Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
2025-05-16 23:12:43 +00:00 · 2024-12-22 12:24:29 +00:00 · 2024-12-22 12:24:29 +00:00 · ee214cb886
commit ee214cb886
parent bb88e1daf8
19 changed files with 342 additions and 61 deletions
--- a/modules/indexer/code/bleve/tokenizer/hierarchy/hierarchy.go
+++ b/modules/indexer/code/bleve/tokenizer/hierarchy/hierarchy.go
@ -0,0 +1,69 @@
+// Copyright 2024 The Forgejo Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package hierarchy
+
+import (
+	"bytes"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// Name is the identifier under which this tokenizer is registered in bleve's registry.
+const Name = "path_hierarchy"
+
+// PathHierarchyTokenizer is a stateless tokenizer that expands a slash-separated
+// path into each of its ancestor prefixes followed by the full path.
+type PathHierarchyTokenizer struct{}
+
+// Tokenize splits a slash-separated path into one token per hierarchy level,
+// similar to elastic's path_hierarchy tokenizer.
+// Leading and trailing slashes are trimmed first. For example,
+// modules/indexer/code/search.go =>
+//
+//	modules
+//	modules/indexer
+//	modules/indexer/code
+//	modules/indexer/code/search.go
+//
+// Every token starts at offset 0, positions count up from 1, and the full
+// (trimmed) path is always the last token. Terms are sub-slices of input
+// rather than copies.
+func (t *PathHierarchyTokenizer) Tokenize(input []byte) analysis.TokenStream {
+	// trim any extra slashes
+	input = bytes.Trim(input, "/")
+
+	// room for 8 tokens up front, so the stream itself is not regrown unless
+	// a path produces more tokens than that (which is unlikely)
+	rv := make(analysis.TokenStream, 0, 8)
+	count, off := 1, 0
+
+	// walk every directory separator; '/' is a single byte, so a byte search is enough
+	for {
+		i := bytes.IndexByte(input[off:], '/')
+		if i < 0 {
+			break
+		}
+		// i is relative to input[off:]; adding it to the accumulated offset
+		// gives the index of the current separator in input
+		off += i
+		rv = append(rv, &analysis.Token{
+			Term:     input[:off], // everything before the current separator
+			Start:    0,
+			End:      off,
+			Position: count,
+			Type:     analysis.AlphaNumeric,
+		})
+		// resume the search just past the separator
+		off++
+		count++
+	}
+
+	// the entire file path should always be the last token
+	rv = append(rv, &analysis.Token{
+		Term:     input,
+		Start:    0,
+		End:      len(input),
+		Position: count,
+		Type:     analysis.AlphaNumeric,
+	})
+
+	return rv
+}
+
+func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
+	return &PathHierarchyTokenizer{}, nil
+}
+
+// init registers TokenizerConstructor under Name in bleve's tokenizer registry
+// when this package is initialized.
+func init() {
+	registry.RegisterTokenizer(Name, TokenizerConstructor)
+}
--- a/modules/indexer/code/bleve/tokenizer/hierarchy/hierarchy_test.go
+++ b/modules/indexer/code/bleve/tokenizer/hierarchy/hierarchy_test.go
@ -0,0 +1,59 @@
+// Copyright 2024 The Forgejo Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package hierarchy
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestIndexerBleveHierarchyTokenizer checks that each path is expanded into the
+// expected hierarchy of prefixes, in order, with positions counting up from 1.
+func TestIndexerBleveHierarchyTokenizer(t *testing.T) {
+	tokenizer := &PathHierarchyTokenizer{}
+	keywords := []struct {
+		Term    string
+		Results []string
+	}{
+		{
+			Term: "modules/indexer/code/search.go",
+			Results: []string{
+				"modules",
+				"modules/indexer",
+				"modules/indexer/code",
+				"modules/indexer/code/search.go",
+			},
+		},
+		{
+			// leading and trailing slashes are trimmed before tokenizing
+			Term: "/tmp/forgejo/",
+			Results: []string{
+				"tmp",
+				"tmp/forgejo",
+			},
+		},
+		{
+			// a path without any separator yields only itself
+			Term: "README.md",
+			Results: []string{
+				"README.md",
+			},
+		},
+		{
+			// more tokens than the tokenizer's initial capacity of 8
+			Term: "a/b/c/d/e/f/g/h/i/j",
+			Results: []string{
+				"a",
+				"a/b",
+				"a/b/c",
+				"a/b/c/d",
+				"a/b/c/d/e",
+				"a/b/c/d/e/f",
+				"a/b/c/d/e/f/g",
+				"a/b/c/d/e/f/g/h",
+				"a/b/c/d/e/f/g/h/i",
+				"a/b/c/d/e/f/g/h/i/j",
+			},
+		},
+	}
+
+	for _, kw := range keywords {
+		tokens := tokenizer.Tokenize([]byte(kw.Term))
+		// assert.Len does not abort the test, so skip the per-token checks on a
+		// mismatch; otherwise kw.Results[i] could panic with an index out of range
+		if !assert.Len(t, tokens, len(kw.Results), "path %q", kw.Term) {
+			continue
+		}
+		for i, token := range tokens {
+			assert.Equal(t, i+1, token.Position, "path %q, token %d", kw.Term, i)
+			assert.Equal(t, kw.Results[i], string(token.Term), "path %q, token %d", kw.Term, i)
+		}
+	}
+}