Skip to main content

lucene tim格式

· 5 min read

背景

tim 文件是 Lucene 存储词项 (term) 及其相关统计信息的文件 (词典文件)。与它相关的还有 tip 文件, tip 文件是指向 tim 文件的词项索引。

格式和例子

文件格式:

可以从最下面的相关阅读获取对应的文档

TermsDict (.tim) --> Header, PostingsHeader, NodeBlock^NumBlocks, Footer
NodeBlock --> (OuterNode | InnerNode)
OuterNode --> EntryCount, SuffixLength, Byte^SuffixLength, StatsLength, < TermStats >^EntryCount, MetaLength, <TermMetadata>^EntryCount
InnerNode --> EntryCount, SuffixLength[,Sub?], Byte^SuffixLength, StatsLength, < TermStats ? >^EntryCount, MetaLength, <TermMetadata ? >^EntryCount
TermStats --> DocFreq, TotalTermFreq
Header --> CodecHeader
EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength --> VInt
TotalTermFreq --> VLong
Footer --> CodecFooter

例子

hexdump -C  _j_Lucene90_0.tim 

00000000 3f d7 6c 17 12 42 6c 6f 63 6b 54 72 65 65 54 65 |?.l..BlockTreeTe|
00000010 72 6d 73 44 69 63 74 00 00 00 00 fe ea 80 e6 45 |rmsDict........E|
00000020 20 d8 56 64 1b 1b 1b 89 70 fe 67 0a 4c 75 63 65 | .Vd....p.g.Luce|
00000030 6e 65 39 30 5f 30 25 bc 03 61 6d 61 6e 64 62 75 |ne90_0%..amandbu|
00000040 74 63 61 6e 64 6f 68 65 6c 6c 6f 68 69 69 69 73 |tcandohellohiiis|
00000050 69 74 6b 6e 6f 77 6d 61 79 6d 6f 6e 67 6f 6e 6f |itknowmaymongono|
00000060 74 74 72 79 77 68 61 74 77 6f 72 6c 64 79 6f 75 |ttrywhatworldyou|
00000070 24 02 03 03 03 02 05 02 01 02 02 04 03 05 03 03 |$...............|
00000080 04 05 03 10 04 00 09 02 01 04 00 03 02 01 01 02 |................|
00000090 01 07 02 02 26 7a 3d 04 01 02 03 01 01 01 01 01 |....&z=.........| <--- 该行第六个字节 (偏移 0x95), 也就是从 7a 开始, 对应下文 bytesReader.bytes 的起始 (0x7a = 122)
000000a0 05 01 01 01 00 02 04 00 02 01 01 01 01 01 02 01 |................|
000000b0 01 01 02 01 01 01 01 05 01 03 01 05 a4 03 2f 68 |............../h|
000000c0 6f 6d 65 2f 75 62 75 6e 74 75 2f 64 6f 63 2f 68 |ome/ubuntu/doc/h|
000000d0 65 6c 6c 6f 2e 74 78 74 2f 68 6f 6d 65 2f 75 62 |ello.txt/home/ub|
000000e0 75 6e 74 75 2f 64 6f 63 2f 6d 6f 6e 67 6f 2e 74 |untu/doc/mongo.t|
000000f0 78 74 05 1a 01 03 04 82 01 01 03 c0 28 93 e8 00 |xt..........(...|
00000100 00 00 00 00 00 00 00 da 02 a3 a3 |...........|

这里的ste.in 是tim文件的数据

main[2] list
472 }
473 }
474
475 // metadata
476 => ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute);
477
478 metaDataUpto++;
479 absolute = false;
480 }
481 state.termBlockOrd = metaDataUpto;
main[2] print ste.in
ste.in = "MMapIndexInput(path="/home/ubuntu/index/_j_Lucene90_0.tim")"

bytesReader.bytes 对应的是 tim 文件中从偏移 0x95 (字节 7a) 开始的内容:

main[2] dump bytesReader.bytes
bytesReader.bytes = {
122, 61, 4, 1, 2, 3, 1, 1, 1, 1, 1, 5, 1, 1, 1, 0, 2, 4, 0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 5, 1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
}

hexdump -C _j_Lucene90_0.tim

00000000 3f d7 6c 17 12 42 6c 6f 63 6b 54 72 65 65 54 65 |?.l..BlockTreeTe|
00000010 72 6d 73 44 69 63 74 00 00 00 00 fe ea 80 e6 45 |rmsDict........E|
00000020 20 d8 56 64 1b 1b 1b 89 70 fe 67 0a 4c 75 63 65 | .Vd....p.g.Luce|
00000030 6e 65 39 30 5f 30 25 bc 03 61 6d 61 6e 64 62 75 |ne90_0%..amandbu|
00000040 74 63 61 6e 64 6f 68 65 6c 6c 6f 68 69 69 69 73 |tcandohellohiiis|
00000050 69 74 6b 6e 6f 77 6d 61 79 6d 6f 6e 67 6f 6e 6f |itknowmaymongono|
00000060 74 74 72 79 77 68 61 74 77 6f 72 6c 64 79 6f 75 |ttrywhatworldyou|
00000070 24 02 03 03 03 02 05 02 01 02 02 04 03 05 03 03 |$...............|
00000080 04 05 03 10 04 00 09 02 01 04 00 03 02 01 01 02 |................|
00000090 01 07 02 02 26 7a 3d 04 01 02 03 01 01 01 01 01 |....&z=.........|
000000a0 05 01 01 01 00 02 04 00 02 01 01 01 01 01 02 01 |................|
000000b0 01 01 02 01 01 01 01 05 01 03 01 05 a4 03 2f 68 |............../h|
000000c0 6f 6d 65 2f 75 62 75 6e 74 75 2f 64 6f 63 2f 68 |ome/ubuntu/doc/h|
000000d0 65 6c 6c 6f 2e 74 78 74 2f 68 6f 6d 65 2f 75 62 |ello.txt/home/ub|
000000e0 75 6e 74 75 2f 64 6f 63 2f 6d 6f 6e 67 6f 2e 74 |untu/doc/mongo.t|
000000f0 78 74 05 1a 01 03 04 82 01 01 03 c0 28 93 e8 00 |xt..........(...|
00000100 00 00 00 00 00 00 00 da 02 a3 a3 |...........|

相关阅读