Skip to main content

fst 结构

· 6 min read

背景

了解lucene 的fst结构

核心函数

freeezeTail -> compileNode

  private void freezeTail(int prefixLenPlus1) throws IOException {  // 入参是一个偏移值:  公共前缀+ 1
final int downTo = Math.max(1, prefixLenPlus1);
for (int idx = lastInput.length(); idx >= downTo; idx--) {

boolean doPrune = false;
boolean doCompile = false;


if (doCompile) {
...
parent.replaceLast(
lastInput.intAt(idx - 1),
compileNode(node, 1 + lastInput.length() - idx),
nextFinalOutput,
isFinal);
...
}
}
}
}
  private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException {
final long node;
long bytesPosStart = bytes.getPosition();
if (dedupHash != null
&& (doShareNonSingletonNodes || nodeIn.numArcs <= 1)
&& tailLength <= shareMaxTailLength) {
if (nodeIn.numArcs == 0) {
node = fst.addNode(this, nodeIn);
lastFrozenNode = node;
} else {
node = dedupHash.add(this, nodeIn);
}
} else {
node = fst.addNode(this, nodeIn);
}
assert node != -2;

long bytesPosEnd = bytes.getPosition();
if (bytesPosEnd != bytesPosStart) {
// The FST added a new node:
assert bytesPosEnd > bytesPosStart;
lastFrozenNode = node;
}

nodeIn.clear();

final CompiledNode fn = new CompiledNode();
fn.node = node;
return fn;
}
  // serializes new node by appending its bytes to the end
// of the current byte[]
long addNode(FSTCompiler<T> fstCompiler, FSTCompiler.UnCompiledNode<T> nodeIn)
throws IOException {
T NO_OUTPUT = outputs.getNoOutput();

// System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
if (nodeIn.numArcs == 0) {
if (nodeIn.isFinal) {
return FINAL_END_NODE;
} else {
return NON_FINAL_END_NODE;
}
}
final long startAddress = fstCompiler.bytes.getPosition();
// System.out.println(" startAddr=" + startAddress);

final boolean doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(fstCompiler, nodeIn);
if (doFixedLengthArcs) {
// System.out.println(" fixed length arcs");
if (fstCompiler.numBytesPerArc.length < nodeIn.numArcs) {
fstCompiler.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, Integer.BYTES)];
fstCompiler.numLabelBytesPerArc = new int[fstCompiler.numBytesPerArc.length];
}
}

fstCompiler.arcCount += nodeIn.numArcs;

final int lastArc = nodeIn.numArcs - 1;

long lastArcStart = fstCompiler.bytes.getPosition();
int maxBytesPerArc = 0;
int maxBytesPerArcWithoutLabel = 0;
for (int arcIdx = 0; arcIdx < nodeIn.numArcs; arcIdx++) {
final FSTCompiler.Arc<T> arc = nodeIn.arcs[arcIdx];
final FSTCompiler.CompiledNode target = (FSTCompiler.CompiledNode) arc.target;
int flags = 0;
// System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" +
// target.node);

if (arcIdx == lastArc) {
flags += BIT_LAST_ARC;
}

if (fstCompiler.lastFrozenNode == target.node && !doFixedLengthArcs) {
// TODO: for better perf (but more RAM used) we
// could avoid this except when arc is "near" the
// last arc:
flags += BIT_TARGET_NEXT;
}

if (arc.isFinal) {
flags += BIT_FINAL_ARC;
if (arc.nextFinalOutput != NO_OUTPUT) {
flags += BIT_ARC_HAS_FINAL_OUTPUT;
}
} else {
assert arc.nextFinalOutput == NO_OUTPUT;
}

boolean targetHasArcs = target.node > 0;

if (!targetHasArcs) {
flags += BIT_STOP_NODE;
}

if (arc.output != NO_OUTPUT) {
flags += BIT_ARC_HAS_OUTPUT;
}

fstCompiler.bytes.writeByte((byte) flags);
long labelStart = fstCompiler.bytes.getPosition();
writeLabel(fstCompiler.bytes, arc.label);
int numLabelBytes = (int) (fstCompiler.bytes.getPosition() - labelStart);

// System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + "
// target=" + target.node + " pos=" + bytes.getPosition() + " output=" +
// outputs.outputToString(arc.output));

if (arc.output != NO_OUTPUT) {
outputs.write(arc.output, fstCompiler.bytes);
// System.out.println(" write output");
}

if (arc.nextFinalOutput != NO_OUTPUT) {
// System.out.println(" write final output");
outputs.writeFinalOutput(arc.nextFinalOutput, fstCompiler.bytes);
}

if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
assert target.node > 0;
// System.out.println(" write target");
fstCompiler.bytes.writeVLong(target.node);
}

// just write the arcs "like normal" on first pass, but record how many bytes each one took
// and max byte size:
if (doFixedLengthArcs) {
int numArcBytes = (int) (fstCompiler.bytes.getPosition() - lastArcStart);
fstCompiler.numBytesPerArc[arcIdx] = numArcBytes;
fstCompiler.numLabelBytesPerArc[arcIdx] = numLabelBytes;
lastArcStart = fstCompiler.bytes.getPosition();
maxBytesPerArc = Math.max(maxBytesPerArc, numArcBytes);
maxBytesPerArcWithoutLabel =
Math.max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
// System.out.println(" arcBytes=" + numArcBytes + " labelBytes=" + numLabelBytes);
}
}

// TODO: try to avoid wasteful cases: disable doFixedLengthArcs in that case
/*
*
* LUCENE-4682: what is a fair heuristic here?
* It could involve some of these:
* 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount?
* 2. how much binSearch saves over scan: nodeIn.numArcs
* 3. waste: numBytes vs numBytesExpanded
*
* the one below just looks at #3
if (doFixedLengthArcs) {
// rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor????
int numBytes = lastArcStart - startAddress;
int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs;
if (numBytesExpanded > numBytes*1.25) {
doFixedLengthArcs = false;
}
}
*/

if (doFixedLengthArcs) {
assert maxBytesPerArc > 0;
// 2nd pass just "expands" all arcs to take up a fixed byte size

int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
assert labelRange > 0;
if (shouldExpandNodeWithDirectAddressing(
fstCompiler, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
writeNodeForDirectAddressing(
fstCompiler, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
fstCompiler.directAddressingNodeCount++;
} else {
writeNodeForBinarySearch(fstCompiler, nodeIn, startAddress, maxBytesPerArc);
fstCompiler.binarySearchNodeCount++;
}
}

final long thisNodeAddress = fstCompiler.bytes.getPosition() - 1;
fstCompiler.bytes.reverse(startAddress, thisNodeAddress);
fstCompiler.nodeCount++;
return thisNodeAddress;
}

Arc<T> 描述的是一个弧

//package org.apache.lucene.util.fst;
// org\apache\lucene\util\fst\FSTCompiler.java
/** Expert: holds a pending (seen but not yet serialized) arc. */
static class Arc<T> {
int label; // really an "unsigned" byte // 一个label
Node target; // 举例:a -> b 那么target 就是b
boolean isFinal;
T output;
T nextFinalOutput;
}

例子

cat的权重是5
dog的权重是7
dogs的权重是13

   String inputValues[] = {"cat", "dog", "dogs"};
long[] outputValues = {5, 7, 13};

下面是一个cat的例子 cat包含三个字符c,a ,t,分别代表三个ASCII码:

  • c:99
  • a:97
  • t:116

定义Arc: 一个弧包含三个内容: [a:label1]->[b:label2] 来描述一个弧:a指向b .其中a的值是label1, b的值是label2

下面是idea的截图, [2747:c] -> [2856:a] -> [2860:t]

fst linklist

下面是fst.bytes 的例子

fst bytes

最后是变成下列格式:

[0, 116, 15, 97, 6, 6, 115, 31, 103, 7, 111, 6, 7, 100, 22, 4, 5, 99, 16]

相关阅读