invertDocument(Tokenizes the fields of a document into Postings)

This seems to be the very heart of the Lucene indexer. The listing appears to be DocumentWriter.invertDocument from a 2.x-era Lucene release: it walks every field of a Document and, for each indexed field, turns the field value into postings while keeping per-field running totals of length (token count), position, and character offset. A small sketch of the token data the inner loop consumes follows, and a usage example comes after the full listing.
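The sketch below is written against the same 2.x-era API as the listing (TokenStream.next() returning Token objects); the analyzer, field name, and sample text are illustrative, not taken from the original post. It prints exactly the values the per-token loop reads: the term text, the position increment, and the start/end character offsets.

// Minimal sketch (not from the original post): dump what a TokenStream yields.
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class TokenDump {
  public static void main(String[] args) throws Exception {
    // Illustrative analyzer; invertDocument uses whatever Analyzer the writer was given.
    StandardAnalyzer analyzer = new StandardAnalyzer();
    TokenStream stream =
        analyzer.tokenStream("body", new StringReader("The quick brown fox"));
    try {
      for (Token t = stream.next(); t != null; t = stream.next()) {
        // termText(): the term to index; getPositionIncrement(): distance from the
        // previous token's position; start/endOffset(): character offsets in the text.
        System.out.println(t.termText()
            + "  posIncr=" + t.getPositionIncrement()
            + "  start=" + t.startOffset()
            + "  end=" + t.endOffset());
      }
    } finally {
      stream.close();
    }
  }
}

With that in mind, the method itself: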

  // Tokenizes the fields of a document into Postings.
  private final void invertDocument(Document doc)
          throws IOException {
    Iterator fieldIterator = doc.getFields().iterator();
   
    while (fieldIterator.hasNext()) {
      Fieldable field = (Fieldable) fieldIterator.next();
      String fieldName = field.name();
      int fieldNumber = fieldInfos.fieldNumber(fieldName);
      int length = fieldLengths[fieldNumber];     // length of field
      int position = fieldPositions[fieldNumber]; // position in field
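      // For multi-valued fields: if an earlier value of this field already contributed
      // tokens, add the analyzer's position increment gap so phrase/span queries
      // cannot match across value boundaries.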
      if (length>0) position+=analyzer.getPositionIncrementGap(fieldName);
      int offset = fieldOffsets[fieldNumber];       // offset field

      if (field.isIndexed()) {
        if (!field.isTokenized()) {    // un-tokenized field
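          // The whole field value becomes a single term: one posting at one position,
          // with no Analyzer involved.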
          String stringValue = field.stringValue();
          if(field.isStoreOffsetWithTermVector())
            addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
          else
            addPosition(fieldName, stringValue, position++, null);
          offset += stringValue.length();
          length++;
        } else {
          Reader reader;     // find or make Reader
          if (field.readerValue() != null)
            reader = field.readerValue();
          else if (field.stringValue() != null)
            reader = new StringReader(field.stringValue());
          else
            throw new IllegalArgumentException
                    ("field must have either String or Reader value");

          // Tokenize field and add to postingTable
          TokenStream stream = analyzer.tokenStream(fieldName, reader);
          try {
            Token lastToken = null;
            for (Token t = stream.next(); t != null; t = stream.next()) {
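              // A position increment of 1 is the normal case; analyzers that remove
              // stop words can leave gaps (> 1), and synonym-style analyzers can emit
              // 0 to stack several terms at the same position.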
              position += (t.getPositionIncrement() - 1);
              if(field.isStoreOffsetWithTermVector())
                addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
              else
                addPosition(fieldName, t.termText(), position++, null);
             
              lastToken = t;
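              // Stop indexing this field once maxFieldLength tokens have been added;
              // the remaining tokens are discarded (only a note goes to infoStream).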
              if (++length >= maxFieldLength) {
                if (infoStream != null)
                  infoStream.println("maxFieldLength " +maxFieldLength+ " reached, ignoring following tokens");
                break;
              }
            }
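            // After the loop: advance the running character offset past this value so
            // that offsets recorded for a later value of the same field keep increasing.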
           
            if(lastToken != null)
              offset += lastToken.endOffset() + 1;
           
          } finally {
            stream.close();
          }
        }

        fieldLengths[fieldNumber] = length;   // save field length
        fieldPositions[fieldNumber] = position;   // save field position
        fieldBoosts[fieldNumber] *= field.getBoost();
        fieldOffsets[fieldNumber] = offset;
      }
    }
  }
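
Finally, a minimal usage sketch of how this code path is typically reached, assuming the 2.x-era IndexWriter/Field API that matches the listing; RAMDirectory, the field names, and the sample text are illustrative. The "id" field takes the un-tokenized branch above, while "body" goes through the Analyzer and, because its term vector stores offsets, records a TermVectorOffsetInfo per token.

// Minimal usage sketch (assumption: 2.x-era Lucene API matching the listing above;
// the directory, field names, and text are illustrative).
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class IndexOneDocument {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    // true = create a new index; this Analyzer is the one invertDocument calls
    // tokenStream() on for tokenized fields.
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);

    Document doc = new Document();
    // Un-tokenized field: indexed as a single posting (the first branch above).
    doc.add(new Field("id", "DOC-001", Field.Store.YES, Field.Index.UN_TOKENIZED));
    // Tokenized field whose term vector stores positions and offsets, so the
    // isStoreOffsetWithTermVector() branch is exercised for every token.
    doc.add(new Field("body", "The quick brown fox jumps over the lazy dog",
        Field.Store.YES, Field.Index.TOKENIZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS));

    writer.addDocument(doc);  // each indexed field ends up going through invertDocument
    writer.close();
  }
}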