From 905a5748a8c80b6dede63cbda5c4504c2fafd850 Mon Sep 17 00:00:00 2001 From: Danko Aleksejevs Date: Wed, 4 Jun 2025 07:42:29 +0200 Subject: [PATCH] Add issue number to the search index, rank number and title matches higher (#7956) (#7968) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An attempt at solving #7956. This (and rebuilding the index) seems enough to ensure the issue *appears* among the results. However, I couldn't figure out from [bleve docs](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md) how to affect the scoring based on specific fields, or whether that is possible at all. Disclaimer: I've never written Go before, sorry 😅 take it as a quick PoC more than anything. ### Tests - I added test coverage for Go changes... - [x] in their respective `*_test.go` for unit tests. - [ ] in the `tests/integration` directory if it involves interactions with a live Forgejo server. - I added test coverage for JavaScript changes... - [ ] in `web_src/js/*.test.js` if it can be unit tested. - [ ] in `tests/e2e/*.test.e2e.js` if it requires interactions with a live Forgejo server (see also the [developer guide for JavaScript testing](https://codeberg.org/forgejo/forgejo/src/branch/forgejo/tests/e2e/README.md#end-to-end-tests)). ### Documentation - [ ] I created a pull request [to the documentation](https://codeberg.org/forgejo/docs) to explain to Forgejo users how to use this change. - [x] I did not document these changes and I do not expect someone else to do it. ### Release notes - [ ] I do not want this change to show in the release notes. - [x] I want the title to show in the release notes with a link to this pull request. - [ ] I want the content of the `release-notes/.md` to be used for the release notes instead of the title. 
## Release notes - Features - [PR](https://codeberg.org/forgejo/forgejo/pulls/7968): Add issue number to the search index, rank number and title matches higher (#7956) Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/7968 Reviewed-by: Shiny Nematoda Co-authored-by: Danko Aleksejevs Co-committed-by: Danko Aleksejevs --- modules/indexer/code/bleve/bleve.go | 4 +- modules/indexer/internal/bleve/query.go | 3 +- modules/indexer/issues/bleve/bleve.go | 15 ++++-- modules/indexer/issues/db/db.go | 12 +++++ .../issues/elasticsearch/elasticsearch.go | 19 ++++--- modules/indexer/issues/internal/model.go | 1 + modules/indexer/issues/internal/qstring.go | 9 ++++ .../indexer/issues/internal/tests/tests.go | 50 +++++++++++++++++++ modules/indexer/issues/util.go | 1 + 9 files changed, 100 insertions(+), 14 deletions(-) diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index eb003baec7..c53b7a2e6d 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -260,11 +260,11 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int if opts.Mode == internal.CodeSearchModeUnion { query := bleve.NewDisjunctionQuery() for _, field := range strings.Fields(opts.Keyword) { - query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, false)) + query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, false, 1.0)) } keywordQuery = query } else { - keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, false) + keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, false, 1.0) } if len(opts.RepoIDs) > 0 { diff --git a/modules/indexer/internal/bleve/query.go b/modules/indexer/internal/bleve/query.go index 7f411b516b..e043023671 100644 --- a/modules/indexer/internal/bleve/query.go +++ b/modules/indexer/internal/bleve/query.go @@ -29,11 +29,12 @@ func MatchQuery(matchTerm, 
field, analyzer string, fuzziness int) *query.MatchQu } // MatchPhraseQuery generates a match phrase query for the given phrase, field and analyzer -func MatchPhraseQuery(matchPhrase, field, analyzer string, autoFuzzy bool) *query.MatchPhraseQuery { +func MatchPhraseQuery(matchPhrase, field, analyzer string, autoFuzzy bool, boost float64) *query.MatchPhraseQuery { q := bleve.NewMatchPhraseQuery(matchPhrase) q.FieldVal = field q.Analyzer = analyzer q.SetAutoFuzziness(autoFuzzy) + q.SetBoost(boost) return q } diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go index 64d3c8122e..573d63a446 100644 --- a/modules/indexer/issues/bleve/bleve.go +++ b/modules/indexer/issues/bleve/bleve.go @@ -23,7 +23,7 @@ import ( const ( issueIndexerAnalyzer = "issueIndexer" issueIndexerDocType = "issueIndexerDocType" - issueIndexerLatestVersion = 4 + issueIndexerLatestVersion = 5 ) const unicodeNormalizeName = "unicodeNormalize" @@ -69,6 +69,7 @@ func generateIssueIndexMapping() (mapping.IndexMapping, error) { docMapping.AddFieldMappingsAt("is_public", boolFieldMapping) + docMapping.AddFieldMappingsAt("index", numberFieldMapping) docMapping.AddFieldMappingsAt("title", textFieldMapping) docMapping.AddFieldMappingsAt("content", textFieldMapping) docMapping.AddFieldMappingsAt("comments", textFieldMapping) @@ -163,9 +164,15 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( q := bleve.NewBooleanQuery() for _, token := range tokens { innerQ := bleve.NewDisjunctionQuery( - inner_bleve.MatchPhraseQuery(token.Term, "title", issueIndexerAnalyzer, token.Fuzzy), - inner_bleve.MatchPhraseQuery(token.Term, "content", issueIndexerAnalyzer, token.Fuzzy), - inner_bleve.MatchPhraseQuery(token.Term, "comments", issueIndexerAnalyzer, token.Fuzzy)) + inner_bleve.MatchPhraseQuery(token.Term, "title", issueIndexerAnalyzer, token.Fuzzy, 2.0), + inner_bleve.MatchPhraseQuery(token.Term, "content", issueIndexerAnalyzer, token.Fuzzy, 1.0), + 
inner_bleve.MatchPhraseQuery(token.Term, "comments", issueIndexerAnalyzer, token.Fuzzy, 1.0)) + + if issueID, err := token.ParseIssueReference(); err == nil { + idQuery := inner_bleve.NumericEqualityQuery(issueID, "index") + idQuery.SetBoost(5.0) + innerQ.AddQuery(idQuery) + } switch token.Kind { case internal.BoolOptMust: diff --git a/modules/indexer/issues/db/db.go b/modules/indexer/issues/db/db.go index 9dd026e74f..397daa3265 100644 --- a/modules/indexer/issues/db/db.go +++ b/modules/indexer/issues/db/db.go @@ -5,6 +5,7 @@ package db import ( "context" + "strconv" "forgejo.org/models/db" issue_model "forgejo.org/models/issues" @@ -71,6 +72,17 @@ func (i *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( )), ), ) + + term := options.Keyword + if term[0] == '#' || term[0] == '!' { + term = term[1:] + } + if issueID, err := strconv.ParseInt(term, 10, 64); err == nil { + cond = builder.Or( + builder.Eq{"`index`": issueID}, + cond, + ) + } } opt, err := ToDBOptions(ctx, options) diff --git a/modules/indexer/issues/elasticsearch/elasticsearch.go b/modules/indexer/issues/elasticsearch/elasticsearch.go index 1bf0145796..9d2786e101 100644 --- a/modules/indexer/issues/elasticsearch/elasticsearch.go +++ b/modules/indexer/issues/elasticsearch/elasticsearch.go @@ -18,7 +18,7 @@ import ( ) const ( - issueIndexerLatestVersion = 1 + issueIndexerLatestVersion = 2 // multi-match-types, currently only 2 types are used // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types esMultiMatchTypeBestFields = "best_fields" @@ -56,7 +56,8 @@ const ( "repo_id": { "type": "long", "index": true }, "is_public": { "type": "boolean", "index": true }, - "title": { "type": "text", "index": true }, + "index": { "type": "long", "index": true }, + "title": { "type": "text", "index": true }, "content": { "type": "text", "index": true }, "comments": { "type" : "text", "index": true }, @@ -155,21 +156,25 @@ func 
(b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( return nil, err } for _, token := range tokens { - innerQ := elastic.NewMultiMatchQuery(token.Term, "title", "content", "comments") + innerQ := elastic.NewMultiMatchQuery(token.Term, "content", "comments").FieldWithBoost("title", 2.0).TieBreaker(0.5) if token.Fuzzy { // If the term is not a phrase use fuzziness set to AUTO innerQ = innerQ.Type(esMultiMatchTypeBestFields).Fuzziness(esFuzzyAuto) } else { innerQ = innerQ.Type(esMultiMatchTypePhrasePrefix) } - + var eitherQ elastic.Query = innerQ + if issueID, err := token.ParseIssueReference(); err == nil { + indexQ := elastic.NewTermQuery("index", issueID).Boost(15.0) + eitherQ = elastic.NewDisMaxQuery().Query(indexQ).Query(innerQ).TieBreaker(0.5) + } switch token.Kind { case internal.BoolOptMust: - q.Must(innerQ) + q.Must(eitherQ) case internal.BoolOptShould: - q.Should(innerQ) + q.Should(eitherQ) case internal.BoolOptNot: - q.MustNot(innerQ) + q.MustNot(eitherQ) } } query.Must(q) diff --git a/modules/indexer/issues/internal/model.go b/modules/indexer/issues/internal/model.go index 03f5595a5b..6c55405179 100644 --- a/modules/indexer/issues/internal/model.go +++ b/modules/indexer/issues/internal/model.go @@ -14,6 +14,7 @@ type IndexerData struct { ID int64 `json:"id"` RepoID int64 `json:"repo_id"` IsPublic bool `json:"is_public"` // If the repo is public + Index int64 `json:"index"` // Fields used for keyword searching Title string `json:"title"` diff --git a/modules/indexer/issues/internal/qstring.go b/modules/indexer/issues/internal/qstring.go index fdb89b09e9..8115fc904f 100644 --- a/modules/indexer/issues/internal/qstring.go +++ b/modules/indexer/issues/internal/qstring.go @@ -5,6 +5,7 @@ package internal import ( "io" + "strconv" "strings" ) @@ -22,6 +23,14 @@ type Token struct { Fuzzy bool } +func (tk *Token) ParseIssueReference() (int64, error) { + term := tk.Term + if term[0] == '#' || term[0] == '!' 
{ + term = term[1:] + } + return strconv.ParseInt(term, 10, 64) +} + type Tokenizer struct { in *strings.Reader } diff --git a/modules/indexer/issues/internal/tests/tests.go b/modules/indexer/issues/internal/tests/tests.go index 1e871c4646..ef75955a14 100644 --- a/modules/indexer/issues/internal/tests/tests.go +++ b/modules/indexer/issues/internal/tests/tests.go @@ -549,6 +549,55 @@ var cases = []*testIndexerCase{ }), result.Total) }, }, + { + Name: "Index", + SearchOptions: &internal.SearchOptions{ + Keyword: "13", + SortBy: internal.SortByScore, + RepoIDs: []int64{5}, + }, + ExpectedIDs: []int64{93}, // 93 = #13 in repo 5 + ExpectedTotal: 1, + }, + { + Name: "Index with prefix", + SearchOptions: &internal.SearchOptions{ + Keyword: "#13", + SortBy: internal.SortByScore, + RepoIDs: []int64{5}, + }, + ExpectedIDs: []int64{93}, + ExpectedTotal: 1, + }, + { + Name: "Index and title boost", + ExtraData: []*internal.IndexerData{ + {ID: 1001, Title: "re #13", RepoID: 5}, + {ID: 1002, Title: "re #1001", Content: "leave 13 alone. 
- 13", RepoID: 5}, + }, + SearchOptions: &internal.SearchOptions{ + Keyword: "!13", + SortBy: internal.SortByScore, + RepoIDs: []int64{5}, + }, + ExpectedIDs: []int64{93, 1001, 1002}, + ExpectedTotal: 3, + }, + { + Name: "Index exclude", + ExtraData: []*internal.IndexerData{ + {ID: 1001, Index: 101, Title: "Brrr", RepoID: 5}, + {ID: 1002, Index: 102, Title: "Brrr", Content: "Brrr", RepoID: 5}, + {ID: 1003, Index: 103, Title: "Brrr", RepoID: 5}, + {ID: 1004, Index: 104, Title: "Brrr", RepoID: 5}, + }, + SearchOptions: &internal.SearchOptions{ + Keyword: "Brrr -101 -103", + SortBy: internal.SortByScore, + }, + ExpectedIDs: []int64{1002, 1004}, + ExpectedTotal: 2, + }, { Name: "SortByCreatedDesc", SearchOptions: &internal.SearchOptions{ @@ -741,6 +790,7 @@ func generateDefaultIndexerData() []*internal.IndexerData { data = append(data, &internal.IndexerData{ ID: id, + Index: issueIndex, RepoID: repoID, IsPublic: repoID%2 == 0, Title: fmt.Sprintf("issue%d of repo%d", issueIndex, repoID), diff --git a/modules/indexer/issues/util.go b/modules/indexer/issues/util.go index 3e6c8babe4..909e840ae5 100644 --- a/modules/indexer/issues/util.go +++ b/modules/indexer/issues/util.go @@ -95,6 +95,7 @@ func getIssueIndexerData(ctx context.Context, issueID int64) (*internal.IndexerD return &internal.IndexerData{ ID: issue.ID, RepoID: issue.RepoID, + Index: issue.Index, IsPublic: !issue.Repo.IsPrivate, Title: issue.Title, Content: issue.Content,