I am trying to use FAISS to create an index and write it to a specified path inside a Docker container. This happens within a Python script that is executed during container creation.
The code for this is shown below (some function definitions were removed deliberately so the code is easier to understand):
Main logic:
def train(index, pickles):
    """Train *index* on the embeddings from the first TRAIN_BLOCK_SIZE pickles.

    Removes any stale trained index, loads each training pickle from the
    ``pickles/`` directory, stacks the ``'embedding'`` vectors into one
    float32 matrix, trains the index on it, and writes the trained index
    to ``config.TRAINED_INDEX_PATH``.

    :param index: an (untrained) faiss index produced by ``index_factory``
    :param pickles: list of pickle file names; only the first
        ``config.TRAIN_BLOCK_SIZE`` are used for training
    """
    remove_old_index()
    train_block = pickles[:config.TRAIN_BLOCK_SIZE]
    # Collect one matrix per file and concatenate once at the end.
    # The original np.append-in-a-loop reallocated and copied the whole
    # accumulated matrix on every iteration (accidental O(n^2)).
    matrices = []
    for p in train_block:
        with open(os.path.join('pickles', p), 'rb') as f:
            batch = pickle.load(f)
        print(f'loaded "{p}" with {len(batch)} records for train')
        matrices.append(
            np.array([x['embedding'] for x in batch]).astype('float32')
        )
    if matrices:
        data_matrix = np.concatenate(matrices, axis=0)
    else:
        # No training files: keep the original empty-matrix shape/dtype.
        data_matrix = np.empty([0, config.EMBEDDING_DIM], dtype='float32')
    print('training..')
    index.train(data_matrix)
    print(f'saving {config.TRAINED_INDEX_PATH} ..')
    faiss.write_index(index, config.TRAINED_INDEX_PATH)
def create_block_after_train(i, pickle_batch):
    """Build one populated IVF block index from a batch of records.

    Loads the trained (empty) index from ``config.TRAINED_INDEX_PATH``,
    adds the batch's embeddings under their record ids, and writes the
    result to ``indices/block_{i}.index``.

    :param i: identifier used in the block index file name
    :param pickle_batch: sequence of dicts with ``'embedding'`` and ``'id'``
    :return: path of the written block index file
    """
    _index = faiss.read_index(config.TRAINED_INDEX_PATH)
    data_matrix = np.array(
        [x['embedding'] for x in pickle_batch]
    ).astype('float32')
    # faiss add_with_ids requires int64 ids. numpy's 'int' maps to the
    # platform's default integer (int32 on some platforms), which can
    # silently corrupt ids or crash — be explicit.
    data_ids = np.array(
        [x['id'] for x in pickle_batch]
    ).astype('int64')
    _index.add_with_ids(data_matrix, data_ids)
    print(f'ids range: {data_ids[0]} .. {data_ids[-1]}')
    block_index_path = f'indices/block_{i}.index'
    remove_old_block(block_index_path)
    faiss.write_index(_index, block_index_path)
    print(f'saved {block_index_path}')
    return block_index_path
def create_blocks(pickles):
    """Build one block index per pickle file; return the list of index paths."""
    block_index_files = []
    for pickle_name in pickles:
        pickle_batch = get_pickle_batch(pickle_name)
        print(f'loaded "{pickle_name}" with {len(pickle_batch)} records')
        block_index_files.append(
            create_block_after_train(pickle_name, pickle_batch)
        )
    return block_index_files
def merge(block_index_files, pickles):
    """Merge all block indexes' inverted lists into one on-disk index.

    Memory-maps each block index, collects its inverted lists, merges them
    into a single OnDiskInvertedLists file, and attaches that to a fresh
    copy of the trained index.

    :param block_index_files: paths returned by ``create_blocks``
    :param pickles: unused here (kept for signature compatibility with caller)
    :return: the merged faiss index backed by the on-disk ivfdata file
    """
    ivfs = []
    print(f'appending IVFs ..')
    for block_index_file in block_index_files:
        print(f'appending {block_index_file} ..')
        # mmap so each block's inverted lists stay on disk instead of
        # being loaded fully into RAM.
        _index = faiss.read_index(block_index_file, faiss.IO_FLAG_MMAP)
        ivfs.append(_index.invlists)
        # Detach ownership so the invlists are not freed when _index is
        # garbage-collected at the end of this iteration.
        # NOTE(review): the mmap backing may still be tied to the destroyed
        # _index object on some faiss builds; the reference faiss on-disk
        # demo also keeps the index objects alive until after merge_from —
        # suspect for the post-merge segfault, confirm against that demo.
        _index.own_invlists = False
    print('creating final index..')
    # Fresh trained-but-empty index to receive the merged inverted lists.
    index = faiss.read_index(config.TRAINED_INDEX_PATH)
    merged_ivf_file = f'indices/merged_{config.INDEX}.ivfdata'
    print(f'merging ifvdata to {merged_ivf_file}')
    invlists = faiss.OnDiskInvertedLists(
        index.nlist, index.code_size, merged_ivf_file
    )
    # merge_from takes a raw pointer array; wrap the collected invlists.
    ivf_vector = faiss.InvertedListsPtrVector()
    print(f'creating ivf vector with size {len(ivfs)}..')
    for ivf in ivfs:
        ivf_vector.push_back(ivf)
    print(f'merging..')
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    # ntotal is not updated by replace_invlists; set it manually.
    index.ntotal = ntotal
    print(f'replace_invlists..')
    index.replace_invlists(invlists)
    return index
def run():
    """Entry point: train the index, build per-pickle blocks, merge, save."""
    pickles = get_pickles()
    index = faiss.index_factory(
        config.EMBEDDING_DIM, config.INDEX_OPT, faiss.METRIC_L2
    )
    print(f'total pickles: {len(pickles)}, reading..')
    start_time = time.time()

    train(index, pickles)
    block_index_files = create_blocks(pickles)
    index = merge(block_index_files, pickles)

    print('saving index..')
    print(index.ntotal)
    faiss.write_index(index, config.INDEX_PATH)
    print("--- %s seconds ---" % (time.time() - start_time))
Dockerfile:
FROM python:3.10-slim-buster

ENV PYTHONUNBUFFERED=1 \
    COLUMNS=200 \
    TZ=Asia/Almaty \
    POETRY_VERSION=1.2.2 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=false \
    POETRY_CACHE_DIR='/var/cache/pypoetry' \
    POETRY_HOME='/usr/local'

RUN apt-get update && apt-get upgrade -y \
    && apt-get install --no-install-recommends -y \
        bash \
        build-essential \
        cmake \
        ffmpeg \
        libsm6 \
        libxext6 \
        curl \
    && curl -sSL 'https://install.python-poetry.org' | python - \
    && poetry --version \
    && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
    && apt-get clean -y && rm -rf /var/lib/apt/lists/* \
    && ln -fs /usr/share/zoneinfo/Asia/Almaty /etc/localtime \
    && echo "Asia/Almaty" > /etc/timezone

# Set the working directory BEFORE copying the dependency manifests, so
# `poetry install` runs in the directory that actually contains
# pyproject.toml. (Previously the manifests were copied to / while the
# install ran in /src and only worked if poetry walked up to the parent.)
WORKDIR /src
COPY ./poetry.lock ./pyproject.toml ./

RUN poetry run pip install -U pip && poetry install
RUN mkdir pickles

COPY *.py ./
COPY entrypoint.sh ./
RUN chmod +x './entrypoint.sh'

CMD ["./entrypoint.sh"]
The entrypoint.sh runs the code from Main logic shown above.
Poetry dependencies:
python = ">=3.9, <3.11"
faiss-cpu = "1.7.4"
numpy = "1.21.6"
pymongo = "4.4.0"
Console logs:
...
face_indexer | appending IVFs ..
face_indexer | appending indices/block_380.pickle.index ..
face_indexer | appending indices/block_1120.pickle.index ..
face_indexer | appending indices/block_420.pickle.index ..
face_indexer | appending indices/block_800.pickle.index ..
face_indexer | appending indices/block_1260.pickle.index ..
face_indexer | appending indices/block_1360.pickle.index ..
face_indexer | appending indices/block_1080.pickle.index ..
face_indexer | appending indices/block_700.pickle.index ..
face_indexer | appending indices/block_240.pickle.index ..
face_indexer | appending indices/block_520.pickle.index ..
face_indexer | appending indices/block_840.pickle.index ..
face_indexer | appending indices/block_980.pickle.index ..
face_indexer | appending indices/block_660.pickle.index ..
face_indexer | appending indices/block_140.pickle.index ..
face_indexer | appending indices/block_1220.pickle.index ..
face_indexer | appending indices/block_560.pickle.index ..
face_indexer | appending indices/block_280.pickle.index ..
face_indexer | appending indices/block_940.pickle.index ..
face_indexer | appending indices/block_100.pickle.index ..
face_indexer | appending indices/block_0.pickle.index ..
face_indexer | creating final index..
face_indexer | merging ifvdata to indices/merged_test.ivfdata
face_indexer | creating ivf vector with size 20..
face_indexer | merging..
face_indexer | resizing indices/merged_test.ivfdata to 2816720 bytes
face_indexer | replace_invlists..
face_indexer | saving index..
face_indexer | 1370
face_indexer | ./entrypoint.sh: line 2: 7 Segmentation fault (core dumped) python3 main.py
face_indexer exited with code 139
It is important to note that the same code works without any issues on a server running Ubuntu 20.04.5 LTS, while I am using Manjaro 23.0.0.
I tried installing different versions of faiss-cpu and also tried installing it via conda. Nothing works.