# Page status banner captured together with this file (not part of the script): "Spaces: Runtime error" (shown twice).
# Build retrieval indexes for the Mr. TyDi Korean collection.
#
# Run with bash (the script uses functions with `local`). Only the dense
# (Faiss) indexes are built; both sparse-indexing variants are kept below as
# commented-out reference commands.
#
# `set -e` is intentionally NOT enabled: the two dense encoding runs are
# independent, and a failure of the first must not prevent the second from
# being attempted. The script's exit status is that of the last run.
set -u

# ---------------------------------------------------------------- path setting
CORPUS_DIR=/root/Corpus/mrtydi-korean/collection
CORPUS_PATH="${CORPUS_DIR}/docs.jsonl"
INDEX_DIR=indexes/mrtydi-korean
mkdir -p -- "$INDEX_DIR"

# ------------------------------------------------------------- sparse indexing
# These settings are only referenced by the disabled sparse commands below
# (`lang` is not referenced anywhere in the visible script).
lang=korean
abbr=ko
NUM_THREADS=16

# Disabled: the Anserini `target` directory was not found.
#echo "sparse (anserini version)"
#target/appassembler/bin/IndexCollection \
#  -collection MrTyDiCollection \
#  -input "$CORPUS_DIR" \
#  -index "$INDEX_DIR/sparse_anserini" \
#  -generator DefaultLuceneDocumentGenerator \
#  -threads "$NUM_THREADS" -storePositions -storeDocvectors -storeRaw -language "$abbr"

echo "sparse (pyserini version) ========================> SKIP ====================> "
#python -m pyserini.index.lucene \
#  --collection JsonCollection \
#  --input "$CORPUS_DIR" \
#  --index "$INDEX_DIR/sparse_pyserini" \
#  --generator DefaultLuceneDocumentGenerator \
#  --language "$abbr" \
#  --threads "$NUM_THREADS" \
#  --storePositions --storeDocvectors --storeRaw

# -------------------------------------------------------------- dense indexing
#######################################
# Run `python -m pyserini.encode` over the corpus and write the embeddings,
# with --to-faiss, to "${INDEX_DIR}/dense_maxlen<max-length>".
# Globals:   CORPUS_PATH, INDEX_DIR, ENCODER (read)
# Arguments: $1 - value passed to --max-length (also names the output dir)
#            $2 - value passed to --batch
# Returns:   exit status of the python command
#######################################
encode_dense() {
  local maxlen=$1
  local batch_size=$2

  # NOTE: inside double quotes the shell does not interpret "\n", so the
  # delimiter is passed as the literal characters backslash-n backslash-n.
  # Presumably pyserini.encode unescapes it itself -- TODO confirm.
  python -m pyserini.encode \
    input --corpus "$CORPUS_PATH" \
    --fields title text \
    --delimiter "\n\n" \
    output --embeddings "${INDEX_DIR}/dense_maxlen${maxlen}" \
    --to-faiss \
    encoder --encoder "$ENCODER" \
    --fields title text \
    --max-length "$maxlen" \
    --batch "$batch_size" \
    --fp16
}

echo "dense"
export CUDA_VISIBLE_DEVICES=1
ENCODER=castorini/mdpr-passage-nq

# Run 1: max length 512, batch size 8.
encode_dense 512 8

# Run 2: max length 256 (marked "default" in the original script), batch size 32.
encode_dense 256 32