
Lucene 10 source code analysis


Background

The Lucene checkout on my home machine is version 10, so this walkthrough is based on Lucene 10.

Creating and saving the index

### Breakpoint
java -agentlib:jdwp=transport=dt_socket,server=y,address=8000 -cp /home/dai/lucene/lucene/demo/build/libs/lucene-demo-10.0.0-SNAPSHOT.jar:/home/dai/lucene/lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar org.apache.lucene.demo.IndexFiles -docs /home/dai/docs
### Debugging with jdb
jdb -attach 8000 -sourcepath /home/dai/lucene/lucene/demo/src/java/:/home/dai/lucene/lucene/core/src/java/

Tokenization

Both inverted indexing and tokenization go through this code path:

main[1] where
[1] org.apache.lucene.index.IndexingChain$PerField.invert (IndexingChain.java:1,140)
[2] org.apache.lucene.index.IndexingChain.processField (IndexingChain.java:729)
[3] org.apache.lucene.index.IndexingChain.processDocument (IndexingChain.java:620)
[4] org.apache.lucene.index.DocumentsWriterPerThread.updateDocuments (DocumentsWriterPerThread.java:241)
[5] org.apache.lucene.index.DocumentsWriter.updateDocuments (DocumentsWriter.java:432)
[6] org.apache.lucene.index.IndexWriter.updateDocuments (IndexWriter.java:1,531)
[7] org.apache.lucene.index.IndexWriter.updateDocument (IndexWriter.java:1,816)
[8] org.apache.lucene.index.IndexWriter.addDocument (IndexWriter.java:1,469)
[9] org.apache.lucene.demo.IndexFiles.indexDoc (IndexFiles.java:271)
[10] org.apache.lucene.demo.IndexFiles$1.visitFile (IndexFiles.java:212)
[11] org.apache.lucene.demo.IndexFiles$1.visitFile (IndexFiles.java:208)
[12] java.nio.file.Files.walkFileTree (Files.java:2,811)
[13] java.nio.file.Files.walkFileTree (Files.java:2,882)
[14] org.apache.lucene.demo.IndexFiles.indexDocs (IndexFiles.java:206)
[15] org.apache.lucene.demo.IndexFiles.main (IndexFiles.java:157)
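
invert() consumes the field's TokenStream and hands each term to TermsHashPerField (the step below lands exactly there). Here is a standalone sketch of the analysis half, assuming StandardAnalyzer, which the IndexFiles demo uses:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizeDemo {
  public static void main(String[] args) throws Exception {
    try (Analyzer analyzer = new StandardAnalyzer()) {
      TokenStream ts = analyzer.tokenStream("contents", "I am reading /home/dai/docs/bbb.txt");
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // each token produced here is what invert() would hand to TermsHashPerField.add
        System.out.println(termAtt.toString());
      }
      ts.end();
      ts.close();
    }
  }
}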

Step completed: "thread=main", org.apache.lucene.index.TermsHashPerField.add(), line=193 bci=22
193 int termID = bytesHash.add(termBytes);

main[1] print termBytes
termBytes = "[2f 68 6f 6d 65 2f 64 61 69 2f 64 6f 63 73 2f 62 62 62 2e 74 78 74]"
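
The bytes above decode to "/home/dai/docs/bbb.txt", i.e. the term for the document's path field. bytesHash is a BytesRefHash, which deduplicates term bytes and hands out termIDs. A minimal sketch of its add() contract (using the public no-arg constructor here; the indexing chain wires it to shared block pools instead):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;

public class TermIdDemo {
  public static void main(String[] args) {
    BytesRefHash bytesHash = new BytesRefHash();
    int first = bytesHash.add(new BytesRef("am"));   // >= 0: new term, this is its termID
    int again = bytesHash.add(new BytesRef("am"));   // < 0: already seen, equals -(termID + 1)
    System.out.println(first + " " + again);         // prints "0 -1"
  }
}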

invert

The core of an inverted index is building a term => doc mapping. The key class is lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java; this is its addTerm method:

  @Override
  void addTerm(final int termID, final int docID) {
    final FreqProxPostingsArray postings = freqProxPostingsArray;
    assert !hasFreq || postings.termFreqs[termID] > 0;

    if (!hasFreq) {
      assert postings.termFreqs == null;
      if (termFreqAtt.getTermFrequency() != 1) {
        throw new IllegalStateException(
            "field \""
                + getFieldName()
                + "\": must index term freq while using custom TermFrequencyAttribute");
      }
      if (docID != postings.lastDocIDs[termID]) {
        // New document; now encode docCode for previous doc:
        assert docID > postings.lastDocIDs[termID];
        writeVInt(0, postings.lastDocCodes[termID]);
        postings.lastDocCodes[termID] = docID - postings.lastDocIDs[termID];
        postings.lastDocIDs[termID] = docID;
        fieldState.uniqueTermCount++;
      }
    } else if (docID != postings.lastDocIDs[termID]) {
      assert docID > postings.lastDocIDs[termID]
          : "id: " + docID + " postings ID: " + postings.lastDocIDs[termID] + " termID: " + termID;
      // Term not yet seen in the current doc but previously
      // seen in other doc(s) since the last flush

      // Now that we know doc freq for previous doc,
      // write it & lastDocCode
      if (1 == postings.termFreqs[termID]) {
        writeVInt(0, postings.lastDocCodes[termID] | 1);
      } else {
        writeVInt(0, postings.lastDocCodes[termID]);
        writeVInt(0, postings.termFreqs[termID]);
      }

      // Init freq for the current document
      postings.termFreqs[termID] = getTermFreq();
      fieldState.maxTermFrequency =
          Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
      postings.lastDocCodes[termID] = (docID - postings.lastDocIDs[termID]) << 1;
      postings.lastDocIDs[termID] = docID;
      if (hasProx) {
        writeProx(termID, fieldState.position);
        if (hasOffsets) {
          postings.lastOffsets[termID] = 0;
          writeOffsets(termID, fieldState.offset);
        }
      } else {
        assert !hasOffsets;
      }
      fieldState.uniqueTermCount++;
    } else {
      postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], getTermFreq());
      fieldState.maxTermFrequency =
          Math.max(fieldState.maxTermFrequency, postings.termFreqs[termID]);
      if (hasProx) {
        writeProx(termID, fieldState.position - postings.lastPositions[termID]);
        if (hasOffsets) {
          writeOffsets(termID, fieldState.offset);
        }
      }
    }
  }
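
The lastDocCodes bookkeeping above is a delta encoding: the doc-ID delta is shifted left by one, and the low bit marks "frequency was exactly 1", so the freq VInt can be skipped in that common case (in the real class the code for a document is only flushed once the term's next document arrives, but the encoding is the same). A small standalone sketch of that encoding:

import java.util.ArrayList;
import java.util.List;

public class DocCodeDemo {
  // Encode one term's (docID, freq) postings the way addTerm() does:
  // delta << 1, low bit set when freq == 1, otherwise a second value for the freq.
  static List<Integer> encode(int[][] postings) {
    List<Integer> out = new ArrayList<>();
    int lastDocID = 0;
    for (int[] posting : postings) {
      int docID = posting[0], freq = posting[1];
      int delta = docID - lastDocID;
      if (freq == 1) {
        out.add((delta << 1) | 1);      // single VInt, freq implied
      } else {
        out.add(delta << 1);            // VInt for the doc delta
        out.add(freq);                  // VInt for the frequency
      }
      lastDocID = docID;
    }
    return out;
  }

  public static void main(String[] args) {
    // term seen in doc 3 (freq 2), doc 5 (freq 1), doc 11 (freq 4)
    System.out.println(encode(new int[][] {{3, 2}, {5, 1}, {11, 4}}));
    // prints [6, 2, 5, 12, 4]
  }
}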

Generating the termID

Stack trace

main[1] where
[1] org.apache.lucene.index.TermsHashPerField.initStreamSlices (TermsHashPerField.java:150)
[2] org.apache.lucene.index.TermsHashPerField.add (TermsHashPerField.java:198)
[3] org.apache.lucene.index.IndexingChain$PerField.invert (IndexingChain.java:1,224)
[4] org.apache.lucene.index.IndexingChain.processField (IndexingChain.java:729)
[5] org.apache.lucene.index.IndexingChain.processDocument (IndexingChain.java:620)
[6] org.apache.lucene.index.DocumentsWriterPerThread.updateDocuments (DocumentsWriterPerThread.java:241)
[7] org.apache.lucene.index.DocumentsWriter.updateDocuments (DocumentsWriter.java:432)
[8] org.apache.lucene.index.IndexWriter.updateDocuments (IndexWriter.java:1,531)
[9] org.apache.lucene.index.IndexWriter.updateDocument (IndexWriter.java:1,816)
[10] org.apache.lucene.index.IndexWriter.addDocument (IndexWriter.java:1,469)
[11] org.apache.lucene.demo.IndexFiles.indexDoc (IndexFiles.java:271)
[12] org.apache.lucene.demo.IndexFiles$1.visitFile (IndexFiles.java:212)
[13] org.apache.lucene.demo.IndexFiles$1.visitFile (IndexFiles.java:208)
[14] java.nio.file.Files.walkFileTree (Files.java:2,811)
[15] java.nio.file.Files.walkFileTree (Files.java:2,882)
[16] org.apache.lucene.demo.IndexFiles.indexDocs (IndexFiles.java:206)
[17] org.apache.lucene.demo.IndexFiles.main (IndexFiles.java:157)

    IntBlockPool intPool,
    ByteBlockPool bytePool,
    ByteBlockPool termBytePool,

These pools are the constructor parameters shown above. First, intPool: it maintains a two-dimensional array int[][] buffers plus three offset variables, and the ints it stores are offsets into bytePool.

public final class IntBlockPool {
  ...

  // Core structure: a 2-D array whose entries are offsets into bytePool.
  // Initial capacity is 10 buffers; it grows automatically as needed.
  public int[][] buffers = new int[10][];

  // Index into the buffers array pointing at the current head buffer,
  // i.e. buffer = buffers[bufferUpto].
  private int bufferUpto = -1;
  // The current head buffer (the most recently allocated row of buffers).
  public int[] buffer;
  // intUpto is the write position relative to the current head buffer.
  public int intUpto = INT_BLOCK_SIZE;
  // Absolute offset of the head buffer within the whole pool, so the global
  // position is intOffset + intUpto -- much like relative vs. absolute addressing.
  public int intOffset = -INT_BLOCK_SIZE;
}

Like intPool, bytePool and termBytePool are also described by a two-dimensional array plus a few offset variables:

public final class ByteBlockPool implements Accountable {
  ...

  // Core structure: a 2-D byte array.
  public byte[][] buffers = new byte[10][];

  /** index into the buffers array pointing to the current buffer used as the head */
  private int bufferUpto = -1; // Which buffer we are upto
  /** Where we are in head buffer */
  public int byteUpto = BYTE_BLOCK_SIZE;

  /** Current head buffer */
  public byte[] buffer;
  /** Current head offset */
  public int byteOffset = -BYTE_BLOCK_SIZE;
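
In both pools an absolute ("global") position is simply the head buffer's start offset plus the in-buffer offset, and a global offset maps back to a (buffer, position) pair with a shift and a mask, since the block sizes are powers of two. A small sketch of that addressing, assuming the usual INT_BLOCK_SIZE of 8192 (ByteBlockPool uses a larger block, but the scheme is identical):

public class BlockPoolAddressingDemo {
  static final int INT_BLOCK_SHIFT = 13;
  static final int INT_BLOCK_SIZE = 1 << INT_BLOCK_SHIFT;   // 8192 ints per buffer
  static final int INT_BLOCK_MASK = INT_BLOCK_SIZE - 1;

  public static void main(String[] args) {
    // State as IntBlockPool would hold it: 3rd buffer in use, 100 ints written into it.
    int bufferUpto = 2;
    int intUpto = 100;                                       // offset inside the head buffer
    int intOffset = bufferUpto * INT_BLOCK_SIZE;             // start of the head buffer in the pool

    int globalOffset = intOffset + intUpto;                  // absolute position of the next write
    System.out.println("global offset = " + globalOffset);   // 16484

    // Going back from a global offset to (buffer index, position inside buffer):
    int bufferIndex = globalOffset >> INT_BLOCK_SHIFT;       // 2
    int pos = globalOffset & INT_BLOCK_MASK;                 // 100
    System.out.println("buffer=" + bufferIndex + " pos=" + pos);
  }
}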

Search and query

Breakpoint

## Breakpoint
java -agentlib:jdwp=transport=dt_socket,server=y,address=8000 -cp /home/dai/lucene/lucene/demo/build/libs/lucene-demo-10.0.0-SNAPSHOT.jar:/home/dai/lucene/lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar:/home/dai/lucene/lucene/queryparser/build/libs/lucene-queryparser-10.0.0-SNAPSHOT.jar org.apache.lucene.demo.SearchFiles

## Debugging with jdb
jdb -attach 8000 -sourcepath /home/dai/lucene/lucene/demo/src/java/:/home/dai/lucene/lucene/core/src/java/

termState describes the term's statistics:

main[1] print termState
termState = "TermStates
state=docFreq=1 totalTermFreq=1 termBlockOrd=2 blockFP=0 docStartFP=63 posStartFP=63 payStartFP=0 lastPosBlockOffset=-1 singletonDocID=6
"
main[1] print term
term = "contents:am"
main[1] where
[1] org.apache.lucene.search.TermQuery.createWeight (TermQuery.java:233)
[2] org.apache.lucene.search.IndexSearcher.createWeight (IndexSearcher.java:894)
[3] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:686)
[4] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[6] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[7] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Scoring

The default similarity used for scoring is BM25Similarity:

main[1] where
[1] org.apache.lucene.search.similarities.BM25Similarity.scorer (BM25Similarity.java:200)
[2] org.apache.lucene.search.TermQuery$TermWeight.<init> (TermQuery.java:75)
[3] org.apache.lucene.search.TermQuery.createWeight (TermQuery.java:233)
[4] org.apache.lucene.search.IndexSearcher.createWeight (IndexSearcher.java:894)
[5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:686)
[6] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[8] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[9] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

The core search loop

main[1] list
763 // there is no doc of interest in this reader context
764 // continue with the following leaf
765 continue;
766 }
767 => BulkScorer scorer = weight.bulkScorer(ctx);
768 if (scorer != null) {
769 try {
770 scorer.score(leafCollector, ctx.reader().getLiveDocs());
771 } catch (
772 @SuppressWarnings("unused")
main[1] where
[1] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:767)
[2] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
[3] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[4] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[6] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[7] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Getting the reader

Step completed: "thread=main", org.apache.lucene.index.LeafReaderContext.reader(), line=67 bci=0
67 return reader;

main[1] print reader
reader = "_0(10.0.0):c7:[diagnostics={source=flush, os.arch=amd64, java.runtime.version=17.0.3+7-Ubuntu-0ubuntu0.22.04.1, os.version=5.15.0-33-generic, java.vendor=Private Build, os=Linux, timestamp=1656601918836, java.version=17.0.3, java.vm.version=17.0.3+7-Ubuntu-0ubuntu0.22.04.1, lucene.version=10.0.0}]:[attributes={Lucene90StoredFieldsFormat.mode=BEST_SPEED}] :id=c276i3vlaza4c6uumuxapfnvf"
main[1] where
[1] org.apache.lucene.index.LeafReaderContext.reader (LeafReaderContext.java:67)
[2] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
[3] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
[4] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[5] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[7] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[8] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

The reader object itself:

main[1] dump reader
reader = {
si: instance of org.apache.lucene.index.SegmentCommitInfo(id=1531)
originalSi: instance of org.apache.lucene.index.SegmentCommitInfo(id=1532)
metaData: instance of org.apache.lucene.index.LeafMetaData(id=1533)
liveDocs: null
hardLiveDocs: null
numDocs: 7
core: instance of org.apache.lucene.index.SegmentCoreReaders(id=1534)
segDocValues: instance of org.apache.lucene.index.SegmentDocValues(id=1535)
isNRT: false
docValuesProducer: null
fieldInfos: instance of org.apache.lucene.index.FieldInfos(id=1536)
readerClosedListeners: instance of java.util.concurrent.CopyOnWriteArraySet(id=1537)
readerCacheHelper: instance of org.apache.lucene.index.SegmentReader$1(id=1538)
coreCacheHelper: instance of org.apache.lucene.index.SegmentReader$2(id=1539)
$assertionsDisabled: true
org.apache.lucene.index.LeafReader.readerContext: instance of org.apache.lucene.index.LeafReaderContext(id=1540)
org.apache.lucene.index.LeafReader.$assertionsDisabled: true
org.apache.lucene.index.IndexReader.closed: false
org.apache.lucene.index.IndexReader.closedByChild: false
org.apache.lucene.index.IndexReader.refCount: instance of java.util.concurrent.atomic.AtomicInteger(id=1541)
org.apache.lucene.index.IndexReader.parentReaders: instance of java.util.Collections$SynchronizedSet(id=1542)
}

Scoring:

main[1] list
222
223 @Override
224 public int score(LeafCollector collector, Bits acceptDocs, int min, int max)
225 throws IOException {
226 => collector.setScorer(scorer);
227 DocIdSetIterator scorerIterator = twoPhase == null ? iterator : twoPhase.approximation();
228 DocIdSetIterator competitiveIterator = collector.competitiveIterator();
229 DocIdSetIterator filteredIterator;
230 if (competitiveIterator == null) {
231 filteredIterator = scorerIterator;
main[1] where
[1] org.apache.lucene.search.Weight$DefaultBulkScorer.score (Weight.java:226)
[2] org.apache.lucene.search.BulkScorer.score (BulkScorer.java:38)
[3] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
[4] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
[5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[6] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[8] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[9] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Collecting and ranking hits

  private static class SimpleTopScoreDocCollector extends TopScoreDocCollector {

    ...

    @Override
    public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
      ...
      return new ScorerLeafCollector() {
        ...
        @Override
        public void collect(int doc) throws IOException {
          // No docID needs to be passed here: the scorer's iterator is already
          // positioned on the current doc, so score() knows which document it scores.
          float score = scorer.score();

          // This collector relies on the fact that scorers produce positive values:
          assert score >= 0; // NOTE: false for NaN

          totalHits++;
          hitsThresholdChecker.incrementHitCount();

          if (minScoreAcc != null && (totalHits & minScoreAcc.modInterval) == 0) {
            updateGlobalMinCompetitiveScore(scorer);
          }

          if (score <= pqTop.score) {
            if (totalHitsRelation == TotalHits.Relation.EQUAL_TO) {
              // we just reached totalHitsThreshold, we can start setting the min
              // competitive score now
              updateMinCompetitiveScore(scorer);
            }
            // Since docs are returned in-order (i.e., increasing doc Id), a document
            // with equal score to pqTop.score cannot compete since HitQueue favors
            // documents with lower doc Ids. Therefore reject those docs too.
            return;
          }
          pqTop.doc = doc + docBase;
          pqTop.score = score;
          pqTop = pq.updateTop();
          updateMinCompetitiveScore(scorer);
        }
      };
    }
main[1] print scorer
scorer = "scorer(weight(contents:am))[org.apache.lucene.search.TermScorer@290dbf45]"
main[1] dump scorer
scorer = {
postingsEnum: instance of org.apache.lucene.index.SlowImpactsEnum(id=1546)
impactsEnum: instance of org.apache.lucene.index.SlowImpactsEnum(id=1546)
iterator: instance of org.apache.lucene.search.ImpactsDISI(id=1547)
docScorer: instance of org.apache.lucene.search.LeafSimScorer(id=1548)
impactsDisi: instance of org.apache.lucene.search.ImpactsDISI(id=1547)
$assertionsDisabled: true
org.apache.lucene.search.Scorer.weight: instance of org.apache.lucene.search.TermQuery$TermWeight(id=1549)
}
main[1] where
[1] org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1.collect (TopScoreDocCollector.java:76) <--- no doc_id is passed into the scorer here because the collect(doc) callback supplies it; pq is a priority queue that keeps the collected docs ordered
[2] org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll (Weight.java:305)
[3] org.apache.lucene.search.Weight$DefaultBulkScorer.score (Weight.java:247)
[4] org.apache.lucene.search.BulkScorer.score (BulkScorer.java:38)
[5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
[6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
[7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[8] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[9] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[10] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[11] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)
main[1]
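
The pq used by collect() is a HitQueue: a bounded min-heap holding the requested number of hits, whose head (pqTop) is the weakest hit that is still competitive. A Lucene-independent sketch of the same top-k pattern, using plain java.util.PriorityQueue:

import java.util.Comparator;
import java.util.PriorityQueue;

public class TopKDemo {
  record Hit(int doc, float score) {}

  public static void main(String[] args) {
    int k = 3;
    // Min-heap ordered by score: the head is the weakest hit still in the top-k.
    PriorityQueue<Hit> pq = new PriorityQueue<>(k, Comparator.comparingDouble(Hit::score));
    float[] scores = {0.3f, 1.2f, 0.7f, 2.5f, 0.1f, 1.9f};
    for (int doc = 0; doc < scores.length; doc++) {
      if (pq.size() < k) {
        pq.add(new Hit(doc, scores[doc]));
      } else if (scores[doc] > pq.peek().score()) {
        pq.poll();                        // evict the weakest hit, like overwriting pqTop and calling updateTop()
        pq.add(new Hit(doc, scores[doc]));
      }                                   // otherwise the doc cannot compete, mirroring the early return above
    }
    System.out.println(pq);               // the 3 best hits (heap order, not sorted yet)
  }
}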

The core scoring function

Computing the score

main[1] list
246 // float. And then monotonicity is preserved through composition via
247 // x -> 1 + x and x -> 1 - 1/x.
248 // Finally we expand weight * (1 - 1 / (1 + freq * 1/norm)) to
249 // weight - weight / (1 + freq * 1/norm), which runs slightly faster.
250 => float normInverse = cache[((byte) encodedNorm) & 0xFF];
251 return weight - weight / (1f + freq * normInverse);
252 }
253
254 @Override
255 public Explanation explain(Explanation freq, long encodedNorm) {
main[1] where
[1] org.apache.lucene.search.similarities.BM25Similarity$BM25Scorer.score (BM25Similarity.java:250)
[2] org.apache.lucene.search.LeafSimScorer.score (LeafSimScorer.java:60)
[3] org.apache.lucene.search.TermScorer.score (TermScorer.java:75)
[4] org.apache.lucene.search.TopScoreDocCollector$SimpleTopScoreDocCollector$1.collect (TopScoreDocCollector.java:73)
[5] org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll (Weight.java:305)
[6] org.apache.lucene.search.Weight$DefaultBulkScorer.score (Weight.java:247)
[7] org.apache.lucene.search.BulkScorer.score (BulkScorer.java:38)
[8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
[9] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
[10] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[11] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[12] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[13] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[14] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)
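
Reading the jdb listing above together with its comments: cache[...] stores (up to the norm decoding) 1 / (k1 * (1 - b + b * dl/avgdl)), so weight - weight / (1 + freq * normInverse) is algebraically the familiar BM25 term weight * freq / (freq + k1 * (1 - b + b * dl/avgdl)). A toy check of that identity (the values are made up; weight stands for the precomputed boost * idf):

// A toy check (not Lucene code) that the rewritten formula matches the textbook BM25 term.
public class Bm25AlgebraDemo {
  public static void main(String[] args) {
    float k1 = 1.2f, b = 0.75f;       // Lucene's default BM25 parameters
    float weight = 2.0f;              // boost * idf, precomputed per term
    float freq = 3.0f;                // term frequency in the current doc
    float dl = 12.0f, avgdl = 8.0f;   // doc length and average field length

    float norm = k1 * (1 - b + b * dl / avgdl);
    float normInverse = 1f / norm;    // what the cache[] lookup provides

    float lucene = weight - weight / (1f + freq * normInverse);
    float textbook = weight * freq / (freq + norm);
    System.out.println(lucene + " vs " + textbook);  // identical up to float rounding
  }
}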

The reduce step

main[1] list
60 * Populates the results array with the ScoreDoc instances. This can be overridden in case a
61 * different ScoreDoc type should be returned.
62 */
63 protected void populateResults(ScoreDoc[] results, int howMany) {
64 => for (int i = howMany - 1; i >= 0; i--) {
65 results[i] = pq.pop();
66 }
67 }
68
69 /**
main[1] where
[1] org.apache.lucene.search.TopDocsCollector.populateResults (TopDocsCollector.java:64)
[2] org.apache.lucene.search.TopDocsCollector.topDocs (TopDocsCollector.java:166)
[3] org.apache.lucene.search.TopDocsCollector.topDocs (TopDocsCollector.java:98)
[4] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:526)
[5] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:505)
[6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:694)
[7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[8] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[9] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[10] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[11] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Helper function that merges the top-k results

Stack trace:

main[1] where
[1] org.apache.lucene.search.TopDocs.mergeAux (TopDocs.java:312)
[2] org.apache.lucene.search.TopDocs.merge (TopDocs.java:216)
[3] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:528)
[4] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:505)
[5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:694)
[6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[7] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[9] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[10] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

  /**
   * Auxiliary method used by the {@link #merge} impls. A sort value of null is used to indicate
   * that docs should be sorted by score.
   */
  private static TopDocs mergeAux(
      Sort sort, int start, int size, TopDocs[] shardHits, Comparator<ScoreDoc> tieBreaker) {

    final PriorityQueue<ShardRef> queue;
    if (sort == null) {
      queue = new ScoreMergeSortQueue(shardHits, tieBreaker);
    } else {
      queue = new MergeSortQueue(sort, shardHits, tieBreaker);
    }

    long totalHitCount = 0;
    TotalHits.Relation totalHitsRelation = TotalHits.Relation.EQUAL_TO;
    int availHitCount = 0;
    for (int shardIDX = 0; shardIDX < shardHits.length; shardIDX++) {
      final TopDocs shard = shardHits[shardIDX];
      // totalHits can be non-zero even if no hits were
      // collected, when searchAfter was used:
      totalHitCount += shard.totalHits.value;
      // If any hit count is a lower bound then the merged
      // total hit count is a lower bound as well
      if (shard.totalHits.relation == TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO) {
        totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
      }
      if (shard.scoreDocs != null && shard.scoreDocs.length > 0) {
        availHitCount += shard.scoreDocs.length;
        queue.add(new ShardRef(shardIDX));
      }
    }

    final ScoreDoc[] hits;
    boolean unsetShardIndex = false;
    if (availHitCount <= start) {
      hits = new ScoreDoc[0];
    } else {
      hits = new ScoreDoc[Math.min(size, availHitCount - start)];
      int requestedResultWindow = start + size;
      int numIterOnHits = Math.min(availHitCount, requestedResultWindow);
      int hitUpto = 0;
      while (hitUpto < numIterOnHits) {
        assert queue.size() > 0;
        ShardRef ref = queue.top();
        final ScoreDoc hit = shardHits[ref.shardIndex].scoreDocs[ref.hitIndex++];

        // Irrespective of whether we use shard indices for tie breaking or not, we check for
        // consistent
        // order in shard indices to defend against potential bugs
        if (hitUpto > 0) {
          if (unsetShardIndex != (hit.shardIndex == -1)) {
            throw new IllegalArgumentException("Inconsistent order of shard indices");
          }
        }

        unsetShardIndex |= hit.shardIndex == -1;

        if (hitUpto >= start) {
          hits[hitUpto - start] = hit;
        }

        hitUpto++;

        if (ref.hitIndex < shardHits[ref.shardIndex].scoreDocs.length) {
          // Not done with this these TopDocs yet:
          queue.updateTop();
        } else {
          queue.pop();
        }
      }
    }

    TotalHits totalHits = new TotalHits(totalHitCount, totalHitsRelation);
    if (sort == null) {
      return new TopDocs(totalHits, hits);
    } else {
      return new TopFieldDocs(totalHits, hits, sort.getSort());
    }
  }

Fetching the stored document by docID

    fieldsStream.seek(startPointer);
    decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
    assert bytes.length == length;
    documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);

Stack trace:

main[1] where
[1] org.apache.lucene.store.ByteBufferIndexInput$SingleBufferImpl.seek (ByteBufferIndexInput.java:576)
[2] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader$BlockState.document (Lucene90CompressingStoredFieldsReader.java:594)
[3] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.document (Lucene90CompressingStoredFieldsReader.java:610)
[4] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.visitDocument (Lucene90CompressingStoredFieldsReader.java:628)
[5] org.apache.lucene.index.CodecReader.document (CodecReader.java:89)
[6] org.apache.lucene.index.BaseCompositeReader.document (BaseCompositeReader.java:154)
[7] org.apache.lucene.index.IndexReader.document (IndexReader.java:380)
[8] org.apache.lucene.search.IndexSearcher.doc (IndexSearcher.java:380)
[9] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:214)
[10] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)


Loading index files into memory with mmap:

Breakpoint hit: "thread=main", org.apache.lucene.store.ByteBufferIndexInput.setCurBuf(), line=86 bci=0
86 this.curBuf = curBuf;

main[1] where
[1] org.apache.lucene.store.ByteBufferIndexInput.setCurBuf (ByteBufferIndexInput.java:86)
[2] org.apache.lucene.store.ByteBufferIndexInput$SingleBufferImpl.<init> (ByteBufferIndexInput.java:556)
[3] org.apache.lucene.store.ByteBufferIndexInput.newInstance (ByteBufferIndexInput.java:63)
[4] org.apache.lucene.store.MMapDirectory.openInput (MMapDirectory.java:238)
[5] org.apache.lucene.store.Directory.openChecksumInput (Directory.java:152)
[6] org.apache.lucene.index.SegmentInfos.readCommit (SegmentInfos.java:290)
[7] org.apache.lucene.index.StandardDirectoryReader$1.doBody (StandardDirectoryReader.java:88)
[8] org.apache.lucene.index.StandardDirectoryReader$1.doBody (StandardDirectoryReader.java:77)
[9] org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run (SegmentInfos.java:798)
[10] org.apache.lucene.index.StandardDirectoryReader.open (StandardDirectoryReader.java:109)
[11] org.apache.lucene.index.StandardDirectoryReader.open (StandardDirectoryReader.java:67)
[12] org.apache.lucene.index.DirectoryReader.open (DirectoryReader.java:60)
[13] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:105)

Clearly the file is opened in org.apache.lucene.store.MMapDirectory.openInput; this is where the index file is opened and memory-mapped.

The first file opened is segments_1:

main[1] print name
name = "segments_1"
main[1] list
228
229 /** Creates an IndexInput for the file with the given name. */
230 @Override
231 public IndexInput openInput(String name, IOContext context) throws IOException {
232 => ensureOpen();
233 ensureCanRead(name);
234 Path path = directory.resolve(name);
235 try (FileChannel c = FileChannel.open(path, StandardOpenOption.READ)) {
236 final String resourceDescription = "MMapIndexInput(path=\"" + path.toString() + "\")";
237 final boolean useUnmap = getUseUnmap();
main[1]
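
openInput then maps the opened FileChannel into memory (the exact mapping helper it delegates to varies across Lucene versions). A minimal standalone sketch of the same java.nio mechanism, with a hypothetical path:

import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class MmapDemo {
  public static void main(String[] args) throws Exception {
    Path path = Path.of("/home/dai/index/segments_1");   // hypothetical index file
    try (FileChannel c = FileChannel.open(path, StandardOpenOption.READ)) {
      // Map the whole file read-only; subsequent reads go through the page cache,
      // with no explicit read() calls.
      MappedByteBuffer buf = c.map(FileChannel.MapMode.READ_ONLY, 0, c.size());
      System.out.println("mapped " + buf.remaining() + " bytes, first byte = " + buf.get(0));
    }
  }
}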

Example: reading a string field:

  private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits)
      throws IOException {
    switch (bits & TYPE_MASK) {
      case BYTE_ARR:
        int length = in.readVInt();
        byte[] data = new byte[length];
        in.readBytes(data, 0, length);
        visitor.binaryField(info, data);
        break;
      case STRING:
        visitor.stringField(info, in.readString());
        break;
      case NUMERIC_INT:
        visitor.intField(info, in.readZInt());
        break;
      case NUMERIC_FLOAT:
        visitor.floatField(info, readZFloat(in));
        break;
      case NUMERIC_LONG:
        visitor.longField(info, readTLong(in));
        break;
      case NUMERIC_DOUBLE:
        visitor.doubleField(info, readZDouble(in));
        break;
      default:
        throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
    }
  }
main[1] where
[1] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.readField (Lucene90CompressingStoredFieldsReader.java:246)
[2] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.visitDocument (Lucene90CompressingStoredFieldsReader.java:640)
[3] org.apache.lucene.index.CodecReader.document (CodecReader.java:89)
[4] org.apache.lucene.index.BaseCompositeReader.document (BaseCompositeReader.java:154)
[5] org.apache.lucene.index.IndexReader.document (IndexReader.java:380)
[6] org.apache.lucene.search.IndexSearcher.doc (IndexSearcher.java:380)
[7] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:214)
[8] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)
main[1]

main[1] list
66 }
67
68 @Override
69 public void stringField(FieldInfo fieldInfo, String value) throws IOException {
70 => final FieldType ft = new FieldType(TextField.TYPE_STORED);
71 ft.setStoreTermVectors(fieldInfo.hasVectors());
72 ft.setOmitNorms(fieldInfo.omitsNorms());
73 ft.setIndexOptions(fieldInfo.getIndexOptions());
74 doc.add(
75 new StoredField(
main[1] print value
value = "/home/dai/docs/aaa.txt"
main[1] where
[1] org.apache.lucene.document.DocumentStoredFieldVisitor.stringField (DocumentStoredFieldVisitor.java:70)
[2] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.readField (Lucene90CompressingStoredFieldsReader.java:246)
[3] org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsReader.visitDocument (Lucene90CompressingStoredFieldsReader.java:640)
[4] org.apache.lucene.index.CodecReader.document (CodecReader.java:89)
[5] org.apache.lucene.index.BaseCompositeReader.document (BaseCompositeReader.java:154)
[6] org.apache.lucene.index.IndexReader.document (IndexReader.java:380)
[7] org.apache.lucene.search.IndexSearcher.doc (IndexSearcher.java:380)
[8] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:214)
[9] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

The string that was read is added to the doc object.

This is the core function: it seeks in the mmapped file using the computed offset and length, reads the bytes out, and builds the document object from them.


  /**
   * Get the serialized representation of the given docID. This docID has to be contained in the
   * current block.
   */
  SerializedDocument document(int docID) throws IOException {
    if (contains(docID) == false) {
      throw new IllegalArgumentException();
    }

    final int index = docID - docBase;
    final int offset = Math.toIntExact(offsets[index]);
    final int length = Math.toIntExact(offsets[index + 1]) - offset;
    final int totalLength = Math.toIntExact(offsets[chunkDocs]);
    final int numStoredFields = Math.toIntExact(this.numStoredFields[index]);

    final BytesRef bytes;
    if (merging) {
      bytes = this.bytes;
    } else {
      bytes = new BytesRef();
    }
    ...
      fieldsStream.seek(startPointer); // seek to the chunk's start pointer
      decompressor.decompress(fieldsStream, totalLength, offset, length, bytes); // decompress the chunk contents
      assert bytes.length == length;
      documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length); // wrap the bytes for reading
    }

    return new SerializedDocument(documentInput, length, numStoredFields);
  }
}
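
A toy illustration of the offsets[] bookkeeping above: within one decompressed chunk, the document at index occupies the byte range [offsets[index], offsets[index + 1]) relative to the chunk start.

// Not Lucene code: just the offsets[] slicing arithmetic from document() above.
public class ChunkOffsetsDemo {
  public static void main(String[] args) {
    long[] offsets = {0, 12, 12, 40};       // 3 docs in the chunk; doc 1 is empty
    int index = 2;                          // docID - docBase
    int offset = Math.toIntExact(offsets[index]);
    int length = Math.toIntExact(offsets[index + 1]) - offset;
    System.out.println("doc slice: offset=" + offset + " length=" + length); // 12, 28
  }
}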

Getting matching docs

Breakpoint hit: "thread=main", org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum.advance(), line=498 bci=0
498 if (docFreq > BLOCK_SIZE && target > nextSkipDoc) {

main[1] where
[1] org.apache.lucene.codecs.lucene90.Lucene90PostingsReader$BlockDocsEnum.advance (Lucene90PostingsReader.java:498)
[2] org.apache.lucene.index.SlowImpactsEnum.advance (SlowImpactsEnum.java:77)
[3] org.apache.lucene.search.ImpactsDISI.advance (ImpactsDISI.java:128)
[4] org.apache.lucene.search.ImpactsDISI.nextDoc (ImpactsDISI.java:133)
[5] org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll (Weight.java:301)
[6] org.apache.lucene.search.Weight$DefaultBulkScorer.score (Weight.java:247)
[7] org.apache.lucene.search.BulkScorer.score (BulkScorer.java:38)
[8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:770)
[9] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
[10] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[11] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[12] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[13] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[14] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Term query and iteration

Note that ImpactsEnum implements the DocIdSetIterator contract (nextDoc/advance).

1,138      }
1,139
1,140 @Override
1,141 public ImpactsEnum impacts(int flags) throws IOException {
1,142 => assert !eof;
1,143 // if (DEBUG) {
1,144 // System.out.println("BTTR.docs seg=" + segment);
1,145 // }
1,146 currentFrame.decodeMetaData();
1,147 // if (DEBUG) {
main[1] where
[1] org.apache.lucene.codecs.lucene90.blocktree.SegmentTermsEnum.impacts (SegmentTermsEnum.java:1,142)
[2] org.apache.lucene.search.TermQuery$TermWeight.scorer (TermQuery.java:114)
[3] org.apache.lucene.search.Weight.bulkScorer (Weight.java:166)
[4] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:767)
[5] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:693)
[6] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[7] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[9] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[10] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)

Note that PostingsEnum is also a DocIdSetIterator.
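
A minimal sketch of driving that iterator by hand, assuming an already-open LeafReader with an indexed "contents" field; this is essentially what DefaultBulkScorer.scoreAll does with the TermScorer's iterator:

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class WalkPostings {
  static void walk(LeafReader reader) throws IOException {
    Terms terms = reader.terms("contents");
    if (terms == null) return;                              // field not indexed in this segment
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(new BytesRef("am"))) {          // position the enum on the term
      PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
      for (int doc = postings.nextDoc();
          doc != DocIdSetIterator.NO_MORE_DOCS;
          doc = postings.nextDoc()) {
        System.out.println("doc=" + doc + " freq=" + postings.freq());
      }
    }
  }
}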

Sorting the top-k


main[1] where
[1] org.apache.lucene.util.PriorityQueue.upHeap (PriorityQueue.java:276)
[2] org.apache.lucene.util.PriorityQueue.add (PriorityQueue.java:161)
[3] org.apache.lucene.search.TopDocs.mergeAux (TopDocs.java:303)
[4] org.apache.lucene.search.TopDocs.merge (TopDocs.java:216)
[5] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:528)
[6] org.apache.lucene.search.IndexSearcher$2.reduce (IndexSearcher.java:505)
[7] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:694)
[8] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:687)
[9] org.apache.lucene.search.IndexSearcher.searchAfter (IndexSearcher.java:532)
[10] org.apache.lucene.search.IndexSearcher.search (IndexSearcher.java:542)
[11] org.apache.lucene.demo.SearchFiles.doPagingSearch (SearchFiles.java:180)
[12] org.apache.lucene.demo.SearchFiles.main (SearchFiles.java:150)


  @Override
  public boolean lessThan(ShardRef first, ShardRef second) {
    assert first != second;
    ScoreDoc firstScoreDoc = shardHits[first.shardIndex][first.hitIndex];
    ScoreDoc secondScoreDoc = shardHits[second.shardIndex][second.hitIndex];
    if (firstScoreDoc.score < secondScoreDoc.score) {
      return false;
    } else if (firstScoreDoc.score > secondScoreDoc.score) {
      return true;
    } else {
      return tieBreakLessThan(first, firstScoreDoc, second, secondScoreDoc, tieBreakerComparator);
    }
  }
Further reading