Add issue number to the search index, rank number and title matches higher (#7956) (#7968)

An attempt at solving #7956. This (and rebuilding the index) seems enough to ensure the issue *appears* among the results.

However, I couldn't figure out from [bleve docs](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md) how to affect the scoring based on specific fields, or whether that is possible at all.

Disclaimer: I've never written Go before, sorry 😅 take it as a quick PoC more than anything.

### Tests

- I added test coverage for Go changes...
  - [x] in their respective `*_test.go` for unit tests.
  - [ ] in the `tests/integration` directory if it involves interactions with a live Forgejo server.
- I added test coverage for JavaScript changes...
  - [ ] in `web_src/js/*.test.js` if it can be unit tested.
  - [ ] in `tests/e2e/*.test.e2e.js` if it requires interactions with a live Forgejo server (see also the [developer guide for JavaScript testing](https://codeberg.org/forgejo/forgejo/src/branch/forgejo/tests/e2e/README.md#end-to-end-tests)).

### Documentation

- [ ] I created a pull request [to the documentation](https://codeberg.org/forgejo/docs) to explain to Forgejo users how to use this change.
- [x] I did not document these changes and I do not expect someone else to do it.

### Release notes

- [ ] I do not want this change to show in the release notes.
- [x] I want the title to show in the release notes with a link to this pull request.
- [ ] I want the content of the `release-notes/<pull request number>.md` to be used for the release notes instead of the title.

<!--start release-notes-assistant-->

## Release notes
<!--URL:https://codeberg.org/forgejo/forgejo-->
- Features
  - [PR](https://codeberg.org/forgejo/forgejo/pulls/7968): <!--number 7968 --><!--line 0 --><!--description QWRkIGlzc3VlIG51bWJlciB0byB0aGUgc2VhcmNoIGluZGV4LCByYW5rIG51bWJlciBhbmQgdGl0bGUgbWF0Y2hlcyBoaWdoZXIgKCM3OTU2KQ==-->Add issue number to the search index, rank number and title matches higher (#7956)<!--description-->
<!--end release-notes-assistant-->

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/7968
Reviewed-by: Shiny Nematoda <snematoda@noreply.codeberg.org>
Co-authored-by: Danko Aleksejevs <danko@very.lv>
Co-committed-by: Danko Aleksejevs <danko@very.lv>
This commit is contained in:
Danko Aleksejevs 2025-06-04 07:42:29 +02:00 committed by Earl Warren
parent 2529923dea
commit 905a5748a8
9 changed files with 100 additions and 14 deletions

View file

@ -260,11 +260,11 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
if opts.Mode == internal.CodeSearchModeUnion {
query := bleve.NewDisjunctionQuery()
for _, field := range strings.Fields(opts.Keyword) {
query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, false))
query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, false, 1.0))
}
keywordQuery = query
} else {
keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, false)
keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, false, 1.0)
}
if len(opts.RepoIDs) > 0 {

View file

@ -29,11 +29,12 @@ func MatchQuery(matchTerm, field, analyzer string, fuzziness int) *query.MatchQu
}
// MatchPhraseQuery generates a match phrase query for the given phrase, field and analyzer
func MatchPhraseQuery(matchPhrase, field, analyzer string, autoFuzzy bool) *query.MatchPhraseQuery {
func MatchPhraseQuery(matchPhrase, field, analyzer string, autoFuzzy bool, boost float64) *query.MatchPhraseQuery {
// Build the phrase query, then restrict it to a single field and analyzer.
q := bleve.NewMatchPhraseQuery(matchPhrase)
q.FieldVal = field
q.Analyzer = analyzer
q.SetAutoFuzziness(autoFuzzy)
// boost is forwarded unchanged to the query. Callers visible in this change
// pass 1.0 for content/comments and 2.0 for title so that title matches are
// meant to rank higher — NOTE(review): exact scoring effect is up to bleve.
q.SetBoost(boost)
return q
}

View file

@ -23,7 +23,7 @@ import (
const (
issueIndexerAnalyzer = "issueIndexer"
issueIndexerDocType = "issueIndexerDocType"
issueIndexerLatestVersion = 4
issueIndexerLatestVersion = 5
)
const unicodeNormalizeName = "unicodeNormalize"
@ -69,6 +69,7 @@ func generateIssueIndexMapping() (mapping.IndexMapping, error) {
docMapping.AddFieldMappingsAt("is_public", boolFieldMapping)
docMapping.AddFieldMappingsAt("index", numberFieldMapping)
docMapping.AddFieldMappingsAt("title", textFieldMapping)
docMapping.AddFieldMappingsAt("content", textFieldMapping)
docMapping.AddFieldMappingsAt("comments", textFieldMapping)
@ -163,9 +164,15 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
q := bleve.NewBooleanQuery()
for _, token := range tokens {
innerQ := bleve.NewDisjunctionQuery(
inner_bleve.MatchPhraseQuery(token.Term, "title", issueIndexerAnalyzer, token.Fuzzy),
inner_bleve.MatchPhraseQuery(token.Term, "content", issueIndexerAnalyzer, token.Fuzzy),
inner_bleve.MatchPhraseQuery(token.Term, "comments", issueIndexerAnalyzer, token.Fuzzy))
inner_bleve.MatchPhraseQuery(token.Term, "title", issueIndexerAnalyzer, token.Fuzzy, 2.0),
inner_bleve.MatchPhraseQuery(token.Term, "content", issueIndexerAnalyzer, token.Fuzzy, 1.0),
inner_bleve.MatchPhraseQuery(token.Term, "comments", issueIndexerAnalyzer, token.Fuzzy, 1.0))
if issueID, err := token.ParseIssueReference(); err == nil {
idQuery := inner_bleve.NumericEqualityQuery(issueID, "index")
idQuery.SetBoost(5.0)
innerQ.AddQuery(idQuery)
}
switch token.Kind {
case internal.BoolOptMust:

View file

@ -5,6 +5,7 @@ package db
import (
"context"
"strconv"
"forgejo.org/models/db"
issue_model "forgejo.org/models/issues"
@ -71,6 +72,17 @@ func (i *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
)),
),
)
term := options.Keyword
if term[0] == '#' || term[0] == '!' {
term = term[1:]
}
if issueID, err := strconv.ParseInt(term, 10, 64); err == nil {
cond = builder.Or(
builder.Eq{"`index`": issueID},
cond,
)
}
}
opt, err := ToDBOptions(ctx, options)

View file

@ -18,7 +18,7 @@ import (
)
const (
issueIndexerLatestVersion = 1
issueIndexerLatestVersion = 2
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
@ -56,7 +56,8 @@ const (
"repo_id": { "type": "long", "index": true },
"is_public": { "type": "boolean", "index": true },
"title": { "type": "text", "index": true },
"index": { "type": "long", "index": true },
"title": { "type": "text", "index": true },
"content": { "type": "text", "index": true },
"comments": { "type" : "text", "index": true },
@ -155,21 +156,25 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
return nil, err
}
for _, token := range tokens {
innerQ := elastic.NewMultiMatchQuery(token.Term, "title", "content", "comments")
innerQ := elastic.NewMultiMatchQuery(token.Term, "content", "comments").FieldWithBoost("title", 2.0).TieBreaker(0.5)
if token.Fuzzy {
// If the term is not a phrase use fuzziness set to AUTO
innerQ = innerQ.Type(esMultiMatchTypeBestFields).Fuzziness(esFuzzyAuto)
} else {
innerQ = innerQ.Type(esMultiMatchTypePhrasePrefix)
}
var eitherQ elastic.Query = innerQ
if issueID, err := token.ParseIssueReference(); err == nil {
indexQ := elastic.NewTermQuery("index", issueID).Boost(15.0)
eitherQ = elastic.NewDisMaxQuery().Query(indexQ).Query(innerQ).TieBreaker(0.5)
}
switch token.Kind {
case internal.BoolOptMust:
q.Must(innerQ)
q.Must(eitherQ)
case internal.BoolOptShould:
q.Should(innerQ)
q.Should(eitherQ)
case internal.BoolOptNot:
q.MustNot(innerQ)
q.MustNot(eitherQ)
}
}
query.Must(q)

View file

@ -14,6 +14,7 @@ type IndexerData struct {
ID int64 `json:"id"`
RepoID int64 `json:"repo_id"`
IsPublic bool `json:"is_public"` // If the repo is public
Index int64 `json:"index"`
// Fields used for keyword searching
Title string `json:"title"`

View file

@ -5,6 +5,7 @@ package internal
import (
"io"
"strconv"
"strings"
)
@ -22,6 +23,14 @@ type Token struct {
Fuzzy bool
}
// ParseIssueReference interprets the token's term as an issue number.
//
// A single leading '#' or '!' is accepted and stripped (e.g. "13", "#13" and
// "!13" all yield 13). The remainder is parsed as a base-10 int64; a non-nil
// error is returned when the term is empty or is not a valid number.
func (tk *Token) ParseIssueReference() (int64, error) {
	term := tk.Term
	// Check the length before indexing: an empty term must produce a parse
	// error from strconv rather than an index-out-of-range panic.
	if len(term) > 0 && (term[0] == '#' || term[0] == '!') {
		term = term[1:]
	}
	return strconv.ParseInt(term, 10, 64)
}
type Tokenizer struct {
in *strings.Reader
}

View file

@ -549,6 +549,55 @@ var cases = []*testIndexerCase{
}), result.Total)
},
},
{
Name: "Index",
SearchOptions: &internal.SearchOptions{
Keyword: "13",
SortBy: internal.SortByScore,
RepoIDs: []int64{5},
},
ExpectedIDs: []int64{93}, // 93 = #13 in repo 5
ExpectedTotal: 1,
},
{
Name: "Index with prefix",
SearchOptions: &internal.SearchOptions{
Keyword: "#13",
SortBy: internal.SortByScore,
RepoIDs: []int64{5},
},
ExpectedIDs: []int64{93},
ExpectedTotal: 1,
},
{
Name: "Index and title boost",
ExtraData: []*internal.IndexerData{
{ID: 1001, Title: "re #13", RepoID: 5},
{ID: 1002, Title: "re #1001", Content: "leave 13 alone. - 13", RepoID: 5},
},
SearchOptions: &internal.SearchOptions{
Keyword: "!13",
SortBy: internal.SortByScore,
RepoIDs: []int64{5},
},
ExpectedIDs: []int64{93, 1001, 1002},
ExpectedTotal: 3,
},
{
Name: "Index exclude",
ExtraData: []*internal.IndexerData{
{ID: 1001, Index: 101, Title: "Brrr", RepoID: 5},
{ID: 1002, Index: 102, Title: "Brrr", Content: "Brrr", RepoID: 5},
{ID: 1003, Index: 103, Title: "Brrr", RepoID: 5},
{ID: 1004, Index: 104, Title: "Brrr", RepoID: 5},
},
SearchOptions: &internal.SearchOptions{
Keyword: "Brrr -101 -103",
SortBy: internal.SortByScore,
},
ExpectedIDs: []int64{1002, 1004},
ExpectedTotal: 2,
},
{
Name: "SortByCreatedDesc",
SearchOptions: &internal.SearchOptions{
@ -741,6 +790,7 @@ func generateDefaultIndexerData() []*internal.IndexerData {
data = append(data, &internal.IndexerData{
ID: id,
Index: issueIndex,
RepoID: repoID,
IsPublic: repoID%2 == 0,
Title: fmt.Sprintf("issue%d of repo%d", issueIndex, repoID),

View file

@ -95,6 +95,7 @@ func getIssueIndexerData(ctx context.Context, issueID int64) (*internal.IndexerD
return &internal.IndexerData{
ID: issue.ID,
RepoID: issue.RepoID,
Index: issue.Index,
IsPublic: !issue.Repo.IsPrivate,
Title: issue.Title,
Content: issue.Content,