0

I am trying to create an index by using FAISS to the specified path on Docker container. This happens within Python script which is executed during container creation.

The code for this is shown below (some of the functions' definitions were removed deliberately, so that code could be more understandable):

Main logic:

def train(index, pickles):
    remove_old_index()
    train_block = pickles[:config.TRAIN_BLOCK_SIZE]
    data_matrix = np.empty([0, config.EMBEDDING_DIM], dtype='float32')
    for p in train_block:
        with open(os.path.join('pickles', p), 'rb') as f:
            batch = pickle.load(f)
            print(f'loaded "{p}" with {len(batch)} records for train')
            _data_matrix = np.array(
                list(map(lambda x: x['embedding'], batch))
            ).astype('float32')
            data_matrix = np.append(data_matrix, _data_matrix, 0)
    print('training..')
    index.train(data_matrix)
    print(f'saving {config.TRAINED_INDEX_PATH} ..')
    faiss.write_index(index, config.TRAINED_INDEX_PATH)


def create_block_after_train(i, pickle_batch):
    _index = faiss.read_index(config.TRAINED_INDEX_PATH)
    data_matrix = np.array(
        list(map(lambda x: x['embedding'], pickle_batch))
    ).astype('float32')
    data_ids = np.array(
        list(map(lambda x: x['id'], pickle_batch))
    ).astype('int')
    _index.add_with_ids(data_matrix, data_ids)
    print(f'ids range: {data_ids[0]} .. {data_ids[-1]}')
    block_index_path = f'indices/block_{i}.index'
    remove_old_block(block_index_path)
    faiss.write_index(_index, block_index_path)
    print(f'saved {block_index_path}')
    return block_index_path


def create_blocks(pickles):
    block_index_files = []

    for i in pickles:
        pickle_batch = get_pickle_batch(i)
        print(f'loaded "{i}" with {len(pickle_batch)} records')

        block_index_path = create_block_after_train(i, pickle_batch)
        block_index_files.append(block_index_path)

    return block_index_files


def merge(block_index_files, pickles):
    ivfs = []
    print(f'appending IVFs ..')
    for block_index_file in block_index_files:
        print(f'appending {block_index_file} ..')
        _index = faiss.read_index(block_index_file, faiss.IO_FLAG_MMAP)
        ivfs.append(_index.invlists)
        _index.own_invlists = False
    print('creating final index..')
    index = faiss.read_index(config.TRAINED_INDEX_PATH)
    merged_ivf_file = f'indices/merged_{config.INDEX}.ivfdata'
    print(f'merging ifvdata to {merged_ivf_file}')
    invlists = faiss.OnDiskInvertedLists(
        index.nlist, index.code_size, merged_ivf_file
    )
    ivf_vector = faiss.InvertedListsPtrVector()
    print(f'creating ivf vector with size {len(ivfs)}..')
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    print(f'merging..')
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    index.ntotal = ntotal
    print(f'replace_invlists..')
    index.replace_invlists(invlists)
    return index


def run():
    pickles = get_pickles()

    metric = faiss.METRIC_L2
    index = faiss.index_factory(config.EMBEDDING_DIM, config.INDEX_OPT, metric)

    print(f'total pickles: {len(pickles)}, reading..')
    start_time = time.time()

    train(index, pickles)

    block_index_files = create_blocks(pickles)
    index = merge(block_index_files, pickles)

    print('saving index..')

    print(index.ntotal)
    faiss.write_index(index, config.INDEX_PATH)
    print("--- %s seconds ---" % (time.time() - start_time))

Dockerfile:

FROM python:3.10-slim-buster

ENV PYTHONUNBUFFERED=1 \
    COLUMNS=200 \
    TZ=Asia/Almaty \
    POETRY_VERSION=1.2.2 \
    POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_CREATE=false \
    POETRY_CACHE_DIR='/var/cache/pypoetry' \
    POETRY_HOME='/usr/local'

RUN apt-get update && apt-get upgrade -y \
    && apt-get install --no-install-recommends -y \
    bash \
    build-essential \
    cmake \
    ffmpeg \
    libsm6 \
    libxext6 \
    curl \
    && curl -sSL 'https://install.python-poetry.org' | python - \
    && poetry --version \
    && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
    && apt-get clean -y && rm -rf /var/lib/apt/lists/* \
    && ln -fs /usr/share/zoneinfo/Asia/Almaty /etc/localtime \
    && echo "Asia/Almaty" > /etc/timezone

COPY ./poetry.lock ./pyproject.toml ./

WORKDIR /src

RUN poetry run pip install -U pip && poetry install

RUN mkdir pickles

COPY *.py ./

COPY entrypoint.sh ./

RUN chmod +x './entrypoint.sh'

CMD ["./entrypoint.sh"]

The entrypoint.sh runs the code from Main logic shown above.

Poetry dependencies:

python = ">=3.9, <3.11"
faiss-cpu = "1.7.4"
numpy = "1.21.6"
pymongo = "4.4.0"

Console logs:

...
face_indexer  | appending IVFs ..
face_indexer  | appending indices/block_380.pickle.index ..
face_indexer  | appending indices/block_1120.pickle.index ..
face_indexer  | appending indices/block_420.pickle.index ..
face_indexer  | appending indices/block_800.pickle.index ..
face_indexer  | appending indices/block_1260.pickle.index ..
face_indexer  | appending indices/block_1360.pickle.index ..
face_indexer  | appending indices/block_1080.pickle.index ..
face_indexer  | appending indices/block_700.pickle.index ..
face_indexer  | appending indices/block_240.pickle.index ..
face_indexer  | appending indices/block_520.pickle.index ..
face_indexer  | appending indices/block_840.pickle.index ..
face_indexer  | appending indices/block_980.pickle.index ..
face_indexer  | appending indices/block_660.pickle.index ..
face_indexer  | appending indices/block_140.pickle.index ..
face_indexer  | appending indices/block_1220.pickle.index ..
face_indexer  | appending indices/block_560.pickle.index ..
face_indexer  | appending indices/block_280.pickle.index ..
face_indexer  | appending indices/block_940.pickle.index ..
face_indexer  | appending indices/block_100.pickle.index ..
face_indexer  | appending indices/block_0.pickle.index ..
face_indexer  | creating final index..
face_indexer  | merging ifvdata to indices/merged_test.ivfdata
face_indexer  | creating ivf vector with size 20..
face_indexer  | merging..
face_indexer  | resizing indices/merged_test.ivfdata to 2816720 bytes
face_indexer  | replace_invlists..
face_indexer  | saving index..
face_indexer  | 1370
face_indexer  | ./entrypoint.sh: line 2:     7 Segmentation fault      (core dumped) python3 main.py
face_indexer exited with code 139

Important to note that the same code works without any issues on server with Ubuntu 20.04.5 LTS distro, while I am using Manjaro 23.0.0.

Tried to install different versions of faiss-cpu and also tried to install it via conda. Nothing works.

0 Answers0