aios/storage/indexlib/document/normal/IndexDocument.cpp (391 lines of code) (raw):
/*
* Copyright 2014-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "indexlib/document/normal/IndexDocument.h"
#include "autil/StringUtil.h"
#include "indexlib/document/normal/IndexTokenizeField.h"
#include "indexlib/index/common/Constant.h"
#include "indexlib/util/ErrorLogCollector.h"
#include "indexlib/util/Exception.h"
using namespace autil;
using namespace autil::mem_pool;
using namespace std;
namespace indexlib::document {
AUTIL_LOG_SETUP(indexlib.document, IndexDocument);
// Builds an empty document: zero counted fields, docId set to INVALID_DOCID, and both
// payload hash maps constructed with HASH_MAP_INIT_ELEM_COUNT.
// @param pool  stored in _pool; the same pointer is later passed to the
//              IE_POOL_COMPATIBLE_* macros whenever a field is created or destroyed.
// NOTE(review): _payloads and _termPayloads are initialized from the member _pool (not the
// parameter), so this relies on _pool being declared before them in the header — confirm.
IndexDocument::IndexDocument(Pool* pool)
: _fieldCount(0)
, _docId(INVALID_DOCID)
, _pool(pool)
, _payloads(_pool, HASH_MAP_INIT_ELEM_COUNT)
, _termPayloads(_pool, HASH_MAP_INIT_ELEM_COUNT)
{
}
// Releases every field still stored in the document and then empties the
// field, modified-token and term-origin containers.
IndexDocument::~IndexDocument()
{
    for (auto& field : _fields) {
        if (field == NULL) {
            continue;
        }
        IE_POOL_COMPATIBLE_DELETE_CLASS(_pool, field);
        field = NULL;
    }
    _fields.clear();
    _modifiedTokens.clear();
    _termOriginValueMap.clear();
}
// Returns the field stored at fieldId, creating it when the slot is empty.
// - If the slot is out of range, the field vector is grown first (new slots are NULL).
// - An empty slot gets a new field built by CreateFieldByTag (without a field pool).
// - An existing field whose id is still INVALID_FIELDID is adopted: it receives fieldId.
// In both of those cases _fieldCount is incremented; an existing field that already
// carries a valid id is returned untouched.
Field* IndexDocument::CreateField(fieldid_t fieldId, Field::FieldTag fieldTag)
{
    assert(fieldId != INVALID_FIELDID);
    if ((fieldid_t)_fields.size() <= fieldId) {
        _fields.resize(fieldId + 1, NULL);
    }

    // No further resize happens below, so the reference stays valid.
    Field*& slot = _fields[fieldId];
    if (slot == NULL) {
        slot = CreateFieldByTag(_pool, fieldTag, false);
        slot->SetFieldId(fieldId);
        ++_fieldCount;
    } else if (slot->GetFieldId() == INVALID_FIELDID) {
        ++_fieldCount;
        slot->SetFieldId(fieldId);
    }
    return slot;
}
// Destroys the field stored at fieldId (if any) and leaves the slot NULL.
// _fieldCount is decremented only when the removed field carried a valid id, which
// mirrors how CreateField counts fields.
// Ids outside [0, _fields.size()) are ignored. The explicit negative check matters
// because a negative id (this file compares fieldid_t against 0 elsewhere) would pass
// the size comparison and index before the start of the vector.
void IndexDocument::ClearField(fieldid_t fieldId)
{
    if (fieldId < 0 || (fieldid_t)_fields.size() <= fieldId) {
        return;
    }
    if (NULL == _fields[fieldId]) {
        return;
    }
    if (_fields[fieldId]->GetFieldId() != INVALID_FIELDID) {
        --_fieldCount;
    }
    IE_POOL_COMPATIBLE_DELETE_CLASS(_pool, _fields[fieldId]);
    _fields[fieldId] = NULL;
}
void IndexDocument::ClearData()
{
size_t size = _fields.size();
for (size_t i = 0; i < size; i++) {
if (_fields[i]) {
IE_POOL_COMPATIBLE_DELETE_CLASS(_pool, _fields[i]);
_fields[i] = NULL;
}
}
_termOriginValueMap.clear();
_modifiedTokens.clear();
_fields.clear();
_fieldCount = 0;
_docId = INVALID_DOCID;
_sectionAttributeVec.clear();
_payloads.Clear();
_termPayloads.Clear();
_primaryKey.clear();
}
// Stores `field` in the slot given by its own field id, growing the field vector if
// needed. The document destroys its fields in the destructor and in ClearData, so the
// pointer is owned by the document afterwards.
//
// If the slot already holds a different field, that field is destroyed first (as
// SetField does) instead of being silently overwritten. _fieldCount is incremented only
// when the slot did not already hold a counted field, i.e. it was empty or held a field
// whose id was INVALID_FIELDID. Adding the pointer that is already stored is a no-op.
//
// @param field  non-null field with a valid field id (asserted).
// @return always true.
bool IndexDocument::AddField(Field* field)
{
    assert(field && field->GetFieldId() != INVALID_FIELDID);
    const fieldid_t fieldId = field->GetFieldId();
    if ((fieldid_t)_fields.size() <= fieldId) {
        _fields.resize(fieldId + 1, NULL);
    }

    Field* previous = _fields[fieldId];
    if (previous == field) {
        // Already stored here; counting or deleting again would be wrong.
        return true;
    }

    bool previousCounted = false;
    if (previous != NULL) {
        // Must be read before the field is destroyed.
        previousCounted = (previous->GetFieldId() != INVALID_FIELDID);
        IE_POOL_COMPATIBLE_DELETE_CLASS(_pool, previous);
    }

    _fields[fieldId] = field;
    if (!previousCounted) {
        ++_fieldCount;
    }
    return true;
}
// Mutable lookup: forwards to the const overload and strips constness from the result.
Field* IndexDocument::GetField(fieldid_t fieldId)
{
    const IndexDocument* self = this;
    const Field* found = self->GetField(fieldId);
    return const_cast<Field*>(found);
}
// Returns the field stored at fieldId, or NULL when the slot is empty or the id lies
// outside [0, _fields.size()).
// The negative check is required: a negative id (for example a "not found" sentinel
// passed straight through by a caller) would pass the upper-bound comparison and index
// before the start of the vector.
const Field* IndexDocument::GetField(fieldid_t fieldId) const
{
    if (fieldId < 0 || fieldId >= (fieldid_t)_fields.size()) {
        return NULL;
    }
    return _fields[fieldId];
}
// Stores `field` at fieldId and stamps it with that id, growing the field vector if
// needed. A NULL `field` is ignored. A different field previously stored in the slot is
// destroyed.
//
// Two cases are handled explicitly:
//  * `field` is the pointer already stored in the slot: it must not be destroyed,
//    otherwise the slot would keep a dangling pointer that is dereferenced right after.
//  * the slot held a field that was not counted (its id was INVALID_FIELDID): the new
//    field has a valid id, so _fieldCount is incremented, in line with how CreateField
//    and ClearField account for such fields.
void IndexDocument::SetField(fieldid_t fieldId, Field* field)
{
    assert(fieldId != INVALID_FIELDID);
    if (!field) {
        return;
    }
    if ((fieldid_t)_fields.size() <= fieldId) {
        _fields.resize(fieldId + 1, NULL);
    }

    Field* previous = _fields[fieldId];
    // Evaluate before SetFieldId below, since previous may be the same object as field.
    const bool previousCounted = (previous != NULL) && (previous->GetFieldId() != INVALID_FIELDID);
    if (previous != NULL && previous != field) {
        IE_POOL_COMPATIBLE_DELETE_CLASS(_pool, previous);
    }

    _fields[fieldId] = field;
    field->SetFieldId(fieldId);
    if (!previousCounted) {
        ++_fieldCount;
    }
}
// Looks up the doc payload keyed by the 64-bit hash of termText; yields 0 when absent.
docpayload_t IndexDocument::GetDocPayload(const string& termText) const
{
    const auto termHash = HashAlgorithm::hashString64(termText.c_str());
    docpayload_t notFound = 0;
    return _payloads.Find(termHash, notFound);
}
// Looks up the doc payload stored under intKey; yields 0 when absent.
docpayload_t IndexDocument::GetDocPayload(uint64_t intKey) const
{
    docpayload_t notFound = 0;
    const docpayload_t payload = _payloads.Find(intKey, notFound);
    return payload;
}
// Looks up the term payload keyed by the 64-bit hash of termText; yields 0 when absent.
termpayload_t IndexDocument::GetTermPayload(const string& termText) const
{
    const auto termHash = HashAlgorithm::hashString64(termText.c_str());
    uint32_t notFound = 0;
    return _termPayloads.Find(termHash, notFound);
}
// Looks up the term payload stored under intKey; yields 0 when absent.
termpayload_t IndexDocument::GetTermPayload(uint64_t intKey) const
{
    uint32_t notFound = 0;
    const termpayload_t payload = _termPayloads.Find(intKey, notFound);
    return payload;
}
// Two documents are equal when docId, field count, field-vector size and primary key
// match and every field slot is either NULL in both or holds equal fields.
// Payloads, section attributes and modified tokens are not part of the comparison.
bool IndexDocument::operator==(const IndexDocument& doc) const
{
    if (this == &doc) {
        return true;
    }

    const bool sameHeader = _docId == doc._docId && _fieldCount == doc._fieldCount &&
                            _fields.size() == doc._fields.size() && _primaryKey == doc._primaryKey;
    if (!sameHeader) {
        return false;
    }

    for (uint32_t pos = 0; pos < _fields.size(); ++pos) {
        Field* mine = _fields[pos];
        Field* theirs = doc._fields[pos];
        if (mine == NULL || theirs == NULL) {
            // Equal only when both slots are empty.
            if (mine != NULL || theirs != NULL) {
                return false;
            }
            continue;
        }
        if (!(*mine == *theirs)) {
            return false;
        }
    }
    return true;
}
// Stores the term payload under the 64-bit hash of termText.
void IndexDocument::SetTermPayload(const std::string& termText, termpayload_t payload)
{
    const auto termHash = HashAlgorithm::hashString64(termText.c_str());
    _termPayloads.FindAndInsert(termHash, payload);
}
// Stores the term payload under the caller-supplied integer key.
void IndexDocument::SetTermPayload(uint64_t intKey, termpayload_t payload)
{
    auto& termPayloadMap = _termPayloads;
    termPayloadMap.FindAndInsert(intKey, payload);
}
// Stores the doc payload under the 64-bit hash of termText.
void IndexDocument::SetDocPayload(const std::string& termText, docpayload_t docPayload)
{
    const auto termHash = HashAlgorithm::hashString64(termText.c_str());
    _payloads.FindAndInsert(termHash, docPayload);
}
// Stores the doc payload under the caller-supplied integer key.
void IndexDocument::SetDocPayload(uint64_t intKey, docpayload_t docPayload)
{
    auto& docPayloadMap = _payloads;
    docPayloadMap.FindAndInsert(intKey, docPayload);
}
// Passes intKey through payloadConfig.RewriteTermHash and looks up the doc payload
// under the rewritten key.
docpayload_t IndexDocument::GetDocPayload(const config::PayloadConfig& payloadConfig, uint64_t intKey) const
{
    const uint64_t rewrittenKey = payloadConfig.RewriteTermHash(intKey);
    return GetDocPayload(rewrittenKey);
}
// Hashes termText, passes the hash through payloadConfig.RewriteTermHash and looks up
// the doc payload under the rewritten key.
docpayload_t IndexDocument::GetDocPayload(const config::PayloadConfig& payloadConfig, const std::string& termText) const
{
    const auto termHash = HashAlgorithm::hashString64(termText.c_str());
    const uint64_t rewrittenKey = payloadConfig.RewriteTermHash(termHash);
    return GetDocPayload(rewrittenKey);
}
// Hashes termText, passes the hash through payloadConfig.RewriteTermHash and stores
// docPayload under the rewritten key.
void IndexDocument::SetDocPayload(const config::PayloadConfig& payloadConfig, const std::string& termText,
                                  docpayload_t docPayload)
{
    const auto termHash = HashAlgorithm::hashString64(termText.c_str());
    const uint64_t rewrittenKey = payloadConfig.RewriteTermHash(termHash);
    SetDocPayload(rewrittenKey, docPayload);
}
// Builds a StringView from attrStr via autil::MakeCString using the document's pool
// and records it as the section attribute of indexId.
void IndexDocument::CreateSectionAttribute(indexid_t indexId, const string& attrStr)
{
    const StringView pooledValue = autil::MakeCString(attrStr, _pool);
    SetSectionAttribute(indexId, pooledValue);
}
// Records attrStr as the section attribute of indexId. A value equal to the empty
// instance is ignored. The vector grows on demand; new slots are filled with the
// empty instance.
void IndexDocument::SetSectionAttribute(indexid_t indexId, const StringView& attrStr)
{
    assert(indexId != INVALID_INDEXID);
    if (attrStr == StringView::empty_instance()) {
        return;
    }

    const bool needsGrowth = (indexid_t)_sectionAttributeVec.size() <= indexId;
    if (needsGrowth) {
        _sectionAttributeVec.resize(indexId + 1, StringView::empty_instance());
    }
    _sectionAttributeVec[indexId] = attrStr;
}
// Returns the section attribute stored for indexId, or the empty instance when indexId
// is at or beyond the end of the vector.
const StringView& IndexDocument::GetSectionAttribute(indexid_t indexId) const
{
    const bool inRange = indexId < (indexid_t)_sectionAttributeVec.size();
    if (inRange) {
        return _sectionAttributeVec[indexId];
    }
    return StringView::empty_instance();
}
// Iterator to the first slot of the field vector (slots may hold NULL).
IndexDocument::FieldVector::const_iterator IndexDocument::GetFieldBegin()
{
    return _fields.begin();
}
// Past-the-end iterator of the field vector.
IndexDocument::FieldVector::const_iterator IndexDocument::GetFieldEnd()
{
    return _fields.end();
}
// Read-only access to the stored primary key string.
const std::string& IndexDocument::GetPrimaryKey() const
{
    return _primaryKey;
}
// Replaces the stored primary key with a copy of primaryKey.
void IndexDocument::SetPrimaryKey(const std::string& primaryKey)
{
    _primaryKey = primaryKey;
}
// Writes the document body to dataBuffer in a fixed order: field vector, primary key,
// doc payload map, term payload map, section attributes.
// deserialize() reads the same sequence back, so the order must stay in sync with it.
// _docId, _modifiedTokens and _termOriginValueMap are not written here; the latter two
// are handled by serializeVersion10 / serializeVersion11.
void IndexDocument::serialize(DataBuffer& dataBuffer) const
{
SerializeFieldVector(dataBuffer, _fields);
dataBuffer.write(_primaryKey);
SerializeHashMap(dataBuffer, _payloads);
SerializeHashMap(dataBuffer, _termPayloads);
dataBuffer.write(_sectionAttributeVec);
}
// Reads the document body from dataBuffer in the order written by serialize():
// field vector, primary key, doc payload map, term payload map, section attributes.
// @param dataBuffer  source buffer positioned at the start of a serialized document body.
// @param pool        not referenced in this body; allocations go through the member _pool,
//                    the same pool the destructor uses when releasing fields.
// @param docVersion  versions <= 4 are read in legacy mode, where the tag bits of each
//                    field descriptor are ignored and every present field is read as a
//                    TOKEN_FIELD.
// _fieldCount is set to the number of deserialized fields that carry a valid field id.
// NOTE(review): fields already held in _fields are not released before being overwritten;
// presumably this is only called on a freshly constructed or cleared document — confirm.
void IndexDocument::deserialize(DataBuffer& dataBuffer, mem_pool::Pool* pool, uint32_t docVersion)
{
_fieldCount = DeserializeFieldVector(dataBuffer, _fields, _pool, (docVersion <= 4));
dataBuffer.read(_primaryKey);
DeserializeHashMap(dataBuffer, _payloads);
DeserializeHashMap(dataBuffer, _termPayloads);
dataBuffer.read(_sectionAttributeVec, _pool);
}
// Highest index id that has a slot in the section-attribute vector (size - 1), or
// INVALID_INDEXID when the vector is empty.
indexid_t IndexDocument::GetMaxIndexIdInSectionAttribute() const
{
    if (!_sectionAttributeVec.empty()) {
        return (indexid_t)(_sectionAttributeVec.size() - 1);
    }
    return INVALID_INDEXID;
}
// Writes hashMap as its element count followed by each (key, value) pair in the
// order produced by the map's iterator.
template <typename K, typename V>
void IndexDocument::SerializeHashMap(DataBuffer& dataBuffer, const util::HashMap<K, V>& hashMap)
{
    using MapType = util::HashMap<K, V>;

    typename MapType::Iterator cursor = hashMap.CreateIterator();
    dataBuffer.write(hashMap.Size());
    for (; cursor.HasNext();) {
        typename MapType::KeyValuePair& entry = cursor.Next();
        dataBuffer.write(entry.first);
        dataBuffer.write(entry.second);
    }
}
// Replaces the contents of hashMap with the entries read from dataBuffer: a size_t
// element count followed by that many (key, value) pairs.
template <typename K, typename V>
void IndexDocument::DeserializeHashMap(DataBuffer& dataBuffer, util::HashMap<K, V>& hashMap)
{
    size_t entryCount = 0;
    dataBuffer.read(entryCount);
    hashMap.Clear();

    for (size_t remaining = entryCount; remaining > 0; --remaining) {
        K key;
        V value;
        dataBuffer.read(key);
        dataBuffer.read(value);
        hashMap.FindAndInsert(key, value);
    }
}
// Writes the field vector as a uint32 slot count followed, per slot, by a one-byte
// descriptor (0 for an empty slot) and, for non-empty slots, the field itself.
void IndexDocument::SerializeFieldVector(DataBuffer& dataBuffer, const FieldVector& fields)
{
    const uint32_t slotCount = fields.size();
    dataBuffer.write(slotCount);

    for (uint32_t slot = 0; slot < slotCount; ++slot) {
        Field* field = fields[slot];
        const uint8_t descriptor = GenerateFieldDescriptor(field);
        dataBuffer.write(descriptor);
        if (descriptor == 0) {
            continue; // empty slot: the descriptor byte is all that is stored
        }
        dataBuffer.write(*field);
    }
}
// Reads a field vector written by SerializeFieldVector into `fields`.
// For each slot a descriptor byte is read; 0 leaves the slot NULL, anything else
// creates a field and reads its content. In legacy mode the tag bits of the descriptor
// are ignored and every present field is created as a TOKEN_FIELD.
// Raises a DocumentDeserialize fatal error when no field can be created for the tag.
// @return number of deserialized fields whose field id is not INVALID_FIELDID.
uint32_t IndexDocument::DeserializeFieldVector(DataBuffer& dataBuffer, FieldVector& fields, Pool* pool, bool isLegacy)
{
    uint32_t slotCount = 0;
    dataBuffer.read(slotCount);
    fields.resize(slotCount);

    uint32_t validFieldCount = 0;
    for (uint32_t slot = 0; slot < slotCount; ++slot) {
        uint8_t descriptor = 0;
        dataBuffer.read(descriptor);
        if (descriptor == 0) {
            fields[slot] = NULL;
            continue;
        }

        Field::FieldTag fieldTag = Field::FieldTag::TOKEN_FIELD;
        if (!isLegacy) {
            fieldTag = GetFieldTagFromFieldDescriptor(descriptor);
        }

        fields[slot] = CreateFieldByTag(pool, fieldTag, true);
        if (NULL == fields[slot]) {
            INDEXLIB_FATAL_ERROR(DocumentDeserialize, "invalid fieldTag[%d]", static_cast<int8_t>(fieldTag));
        }
        dataBuffer.read(*(fields[slot]));
        if (fields[slot]->GetFieldId() != INVALID_FIELDID) {
            ++validFieldCount;
        }
    }
    return validFieldCount;
}
// Builds the one-byte descriptor stored in front of each serialized field:
// 0 for a missing field, otherwise FIELD_DESCRIPTOR_MASK combined with the field tag.
uint8_t IndexDocument::GenerateFieldDescriptor(const Field* field)
{
    if (field != NULL) {
        const int8_t tagBits = static_cast<int8_t>(field->GetFieldTag());
        return Field::FIELD_DESCRIPTOR_MASK | tagBits;
    }
    return 0;
}
// Recovers the field tag from a descriptor by masking out FIELD_DESCRIPTOR_MASK.
Field::FieldTag IndexDocument::GetFieldTagFromFieldDescriptor(uint8_t fieldDescriptor)
{
    const auto tagBits = (~Field::FIELD_DESCRIPTOR_MASK) & fieldDescriptor;
    return static_cast<Field::FieldTag>(tagBits);
}
// Creates a field object of the concrete type selected by fieldTag.
// @param pool          passed to IE_POOL_COMPATIBLE_NEW_CLASS to allocate the field object.
// @param fieldTag      TOKEN_FIELD, RAW_FIELD or NULL_FIELD.
// @param fieldHasPool  when true the new field is constructed with `pool`, otherwise
//                      with nullptr.
// @return the new field, or NULL (after logging an error) for any other tag.
Field* IndexDocument::CreateFieldByTag(autil::mem_pool::Pool* pool, Field::FieldTag fieldTag, bool fieldHasPool)
{
    autil::mem_pool::Pool* fieldPool = fieldHasPool ? pool : nullptr;
    switch (fieldTag) {
    case Field::FieldTag::TOKEN_FIELD:
        return IE_POOL_COMPATIBLE_NEW_CLASS(pool, IndexTokenizeField, fieldPool);
    case Field::FieldTag::RAW_FIELD:
        return IE_POOL_COMPATIBLE_NEW_CLASS(pool, IndexRawField, fieldPool);
    case Field::FieldTag::NULL_FIELD:
        return IE_POOL_COMPATIBLE_NEW_CLASS(pool, NullField, fieldPool);
    default:
        AUTIL_LOG(ERROR, "invalid fieldTag:[%u]", static_cast<uint16_t>(fieldTag));
        ERROR_COLLECTOR_LOG(ERROR, "invalid fieldTag:[%u]", static_cast<uint16_t>(fieldTag));
        return NULL;
    }
}
// Returns the modified tokens recorded for fieldId, or nullptr when the id is past the
// end of the vector or the slot does not carry that field id.
// The returned pointer refers to an element of _modifiedTokens and becomes invalid once
// PushModifiedToken or SetNullTermModifiedOperation resizes or reassigns that vector.
const ModifiedTokens* IndexDocument::GetFieldModifiedTokens(fieldid_t fieldId) const
{
    assert(fieldId >= 0);
    const bool inRange = fieldId < static_cast<fieldid_t>(_modifiedTokens.size());
    if (inRange && _modifiedTokens[fieldId].FieldId() == fieldId) {
        return &_modifiedTokens[fieldId];
    }
    return nullptr;
}
// Appends (op, termKey) to the modified tokens of fieldId, growing the vector and
// initializing the slot for fieldId first when necessary.
void IndexDocument::PushModifiedToken(fieldid_t fieldId, uint64_t termKey, ModifiedTokens::Operation op)
{
    assert(fieldId != INVALID_FIELDID);
    if (fieldId >= (fieldid_t)_modifiedTokens.size()) {
        _modifiedTokens.resize(fieldId + 1);
    }

    auto& tokens = _modifiedTokens[fieldId];
    if (!tokens.Valid()) {
        tokens = ModifiedTokens(fieldId);
    }
    tokens.Push(op, termKey);
}
// Records op as the null-term operation of fieldId, growing the vector and
// initializing the slot for fieldId first when necessary.
void IndexDocument::SetNullTermModifiedOperation(fieldid_t fieldId, ModifiedTokens::Operation op)
{
    assert(fieldId != INVALID_FIELDID);
    if (fieldId >= (fieldid_t)_modifiedTokens.size()) {
        _modifiedTokens.resize(fieldId + 1);
    }

    auto& tokens = _modifiedTokens[fieldId];
    if (!tokens.Valid()) {
        tokens = ModifiedTokens(fieldId);
    }
    tokens.SetNullTermOperation(op);
}
// Version-10 payload: writes the per-field modified tokens.
void IndexDocument::serializeVersion10(autil::DataBuffer& dataBuffer) const
{
    dataBuffer.write(_modifiedTokens);
}
// Version-10 payload: reads the per-field modified tokens.
void IndexDocument::deserializeVersion10(autil::DataBuffer& dataBuffer)
{
    dataBuffer.read(_modifiedTokens);
}
// Version-11 payload: writes the term origin value map.
void IndexDocument::serializeVersion11(autil::DataBuffer& dataBuffer) const
{
    dataBuffer.write(_termOriginValueMap);
}
// Version-11 payload: reads the term origin value map.
void IndexDocument::deserializeVersion11(autil::DataBuffer& dataBuffer)
{
    dataBuffer.read(_termOriginValueMap);
}
// Merges termOriginValueMap into the document's own map. An index name that is not
// present yet is inserted with a copy of its term map; for an existing index name the
// incoming entries are inserted into the term map already stored.
void IndexDocument::AddTermOriginValue(const TermOriginValueMap& termOriginValueMap)
{
    for (const auto& incoming : termOriginValueMap) {
        const auto& indexName = incoming.first;
        const auto& incomingTerms = incoming.second;

        auto existing = _termOriginValueMap.find(indexName);
        if (existing != _termOriginValueMap.end()) {
            existing->second.insert(incomingTerms.begin(), incomingTerms.end());
            continue;
        }
        // `existing` equals end() here and serves as the insertion hint.
        _termOriginValueMap.insert(existing, std::make_pair(indexName, incomingTerms));
    }
}
} // namespace indexlib::document