루씬 색인기에서 가장 핵심은 이곳인 것 같다.
// Tokenizes the fields of a document into Postings.
private final void invertDocument(Document doc)
throws IOException {
Iterator fieldIterator = doc.getFields().iterator();
while (fieldIterator.hasNext()) {
Fieldable field = (Fieldable) fieldIterator.next();
String fieldName = field.name();
int fieldNumber = fieldInfos.fieldNumber(fieldName);
int length = fieldLengths[fieldNumber]; // length of field
int position = fieldPositions[fieldNumber]; // position in field
if (length>0) position+=analyzer.getPositionIncrementGap(fieldName);
int offset = fieldOffsets[fieldNumber]; // offset field
if (field.isIndexed()) {
if (!field.isTokenized()) { // un-tokenized field
String stringValue = field.stringValue();
if(field.isStoreOffsetWithTermVector())
addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
else
addPosition(fieldName, stringValue, position++, null);
offset += stringValue.length();
length++;
} else
{
Reader reader; // find or make Reader
if (field.readerValue() != null)
reader = field.readerValue();
else if (field.stringValue() != null)
reader = new StringReader(field.stringValue());
else
throw new IllegalArgumentException
("field must have either String or Reader value");
// Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader);
try {
Token lastToken = null;
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
if(field.isStoreOffsetWithTermVector())
addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
else
addPosition(fieldName, t.termText(), position++, null);
lastToken = t;
if (++length >= maxFieldLength) {
if (infoStream != null)
infoStream.println("maxFieldLength " +maxFieldLength+ " reached, ignoring following tokens");
break;
}
}
if(lastToken != null)
offset += lastToken.endOffset() + 1;
} finally {
stream.close();
}
}
fieldLengths[fieldNumber] = length; // save field length
fieldPositions[fieldNumber] = position; // save field position
fieldBoosts[fieldNumber] *= field.getBoost();
fieldOffsets[fieldNumber] = offset;
}
}
}
'IT-Consultant' 카테고리의 다른 글
Posting List 소팅시 quickSort 사용 (0) | 2008.10.29 |
---|---|
invertDocument(Tokenizes the fields of a document into Postings) (0) | 2008.10.29 |
최종적으로 만들어진 Posting List를 어떻게 파일에 쓸까? (0) | 2008.10.29 |
최종적으로 만들어진 Posting List를 어떻게 파일에 쓸까? (0) | 2008.10.29 |
Inverted Index Strategies (0) | 2008.10.29 |