diff --git a/conda_splade_env.yml b/conda_splade_env.yml index e1d9b0c..09280df 100644 --- a/conda_splade_env.yml +++ b/conda_splade_env.yml @@ -1,6 +1,7 @@ name: splade channels: - pytorch + - conda-forge - defaults dependencies: - _libgcc_mutex=0.1=main @@ -10,19 +11,26 @@ dependencies: - aiosignal=1.2.0=pyhd3eb1b0_0 - async-timeout=4.0.1=pyhd3eb1b0_0 - attrs=21.4.0=pyhd3eb1b0_0 + - binutils=2.35.1=hdd6e379_2 + - binutils_impl_linux-64=2.35.1=h27ae35d_9 + - binutils_linux-64=2.35.1=h454624a_30 - blas=1.0=mkl - blinker=1.4=py38h06a4308_0 - brotli=1.0.9=he6710b0_2 - brotlipy=0.7.0=py38h27cfd23_1003 - bzip2=1.0.8=h7b6447c_0 - c-ares=1.18.1=h7f8727e_0 - - ca-certificates=2022.4.26=h06a4308_0 + - c-compiler=1.2.0=h7f98852_0 + - ca-certificates=2022.6.15=ha878542_0 - cachetools=4.2.2=pyhd3eb1b0_0 - - certifi=2021.10.8=py38h06a4308_2 + - certifi=2022.6.15=py38h578d9bd_0 - cffi=1.15.0=py38hd667e15_1 - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.1.3=py38h578d9bd_0 + - cmake=3.20.2=h541d2ed_0 - cryptography=36.0.0=py38h9ce1e76_0 - cudatoolkit=11.3.1=h2bc3f7f_2 + - cxx-compiler=1.2.0=h4bd325d_0 - cycler=0.11.0=pyhd3eb1b0_0 - dataclasses=0.8=pyh6d0b6a4_7 - dbus=1.13.18=hb2f20db_0 @@ -32,6 +40,8 @@ dependencies: - fonttools=4.25.0=pyhd3eb1b0_0 - freetype=2.11.0=h70c0345_0 - frozenlist=1.2.0=py38h7f8727e_0 + - gcc_impl_linux-64=9.3.0=h70c0ae5_19 + - gcc_linux-64=9.3.0=h1ee779e_30 - giflib=5.2.1=h7b6447c_0 - glib=2.69.1=h4ff587b_1 - gmp=6.2.1=h2531618_2 @@ -41,6 +51,8 @@ dependencies: - grpcio=1.42.0=py38hce63b2e_0 - gst-plugins-base=1.14.0=h8213a91_2 - gstreamer=1.14.0=h28cd5cc_2 + - gxx_impl_linux-64=9.3.0=hd87eabc_19 + - gxx_linux-64=9.3.0=h7e70986_30 - h5py=3.6.0=py38ha0f2276_0 - hdf5=1.10.6=hb1b8bf9_0 - icu=58.2=he6710b0_3 @@ -48,11 +60,17 @@ dependencies: - importlib-metadata=4.11.3=py38h06a4308_0 - intel-openmp=2021.4.0=h06a4308_3561 - jpeg=9e=h7f8727e_0 + - kernel-headers_linux-64=2.6.32=he073ed8_15 - kiwisolver=1.3.2=py38h295c915_0 + - krb5=1.19.2=hcc1bbae_0 - lame=3.100=h7b6447c_0 - lcms2=2.12=h3be6417_0 - ld_impl_linux-64=2.35.1=h7274673_9 + - libcurl=7.82.0=h0b77cf5_0 + - libedit=3.1.20210910=h7f8727e_0 + - libev=4.33=h516909a_1 - libffi=3.3=he6710b0_2 + - libgcc-devel_linux-64=9.3.0=h7864c58_19 - libgcc-ng=9.3.0=h5101ec6_17 - libgfortran-ng=7.5.0=ha8ba4b0_17 - libgfortran4=7.5.0=ha8ba4b0_17 @@ -60,8 +78,11 @@ dependencies: - libiconv=1.16=h7f8727e_2 - libidn2=2.3.2=h7f8727e_0 - libllvm11=11.1.0=h3826bc1_1 + - libnghttp2=1.46.0=hce63b2e_0 - libpng=1.6.37=hbc83047_0 - libprotobuf=3.19.1=h4ff587b_0 + - libssh2=1.10.0=h8f2d780_0 + - libstdcxx-devel_linux-64=9.3.0=hb016644_19 - libstdcxx-ng=9.3.0=hd4cf53a_17 - libtasn1=4.16.0=h27cfd23_0 - libtiff=4.2.0=h85742a9_0 @@ -90,7 +111,8 @@ dependencies: - numpy-base=1.21.5=py38hf524024_1 - oauthlib=3.1.0=py_0 - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 + - openjdk=11.0.13=h87a67e3_0 + - openssl=1.1.1o=h7f8727e_0 - packaging=21.3=pyhd3eb1b0_0 - pcre=8.45=h295c915_0 - pillow=9.0.1=py38h22f2fdc_0 @@ -105,18 +127,20 @@ dependencies: - pysocks=1.7.1=py38h06a4308_0 - python=3.8.13=h12debd9_0 - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python_abi=3.8=2_cp38 - pytorch=1.11.0=py3.8_cuda11.3_cudnn8.2.0_0 - pytorch-mutex=1.0=cuda - qt=5.9.7=h5867ecd_1 - readline=8.1.2=h7f8727e_1 - requests=2.27.1=pyhd3eb1b0_0 - requests-oauthlib=1.3.0=py_0 + - rhash=1.4.1=h7f98852_0 - rsa=4.7.2=pyhd3eb1b0_1 - setuptools=61.2.0=py38h06a4308_0 - sip=4.19.13=py38h295c915_0 - six=1.16.0=pyhd3eb1b0_1 - sqlite=3.38.2=hc218d9a_0 - - tbb=2021.5.0=hd09550d_0 + - 
sysroot_linux-64=2.12=he073ed8_15 - tensorboard=2.6.0=py_1 - tensorboard-data-server=0.6.0=py38hca6d32c_0 - tensorboard-plugin-wit=1.6.0=py_0 @@ -126,38 +150,79 @@ dependencies: - tornado=6.1=py38h27cfd23_0 - typing-extensions=4.1.1=hd3eb1b0_0 - typing_extensions=4.1.1=pyh06a4308_0 - - urllib3=1.26.9=py38h06a4308_0 - werkzeug=2.0.3=pyhd3eb1b0_0 - wheel=0.37.1=pyhd3eb1b0_0 - xz=5.2.5=h7b6447c_0 - yarl=1.6.3=py38h27cfd23_0 + - zipp=3.8.0=py38h06a4308_0 - zlib=1.2.12=h7f8727e_2 - zstd=1.4.9=haebb681_0 - pip: - antlr4-python3-runtime==4.8 + - beautifulsoup4==4.11.1 - beir==1.0.0 - - click==8.1.3 + - blis==0.7.8 + - catalogue==2.0.7 + - cssselect==1.1.0 + - cymem==2.0.6 + - cython==0.29.30 - elasticsearch==7.9.1 - faiss-cpu==1.7.2 + - feedfinder2==0.0.4 + - feedparser==6.0.10 - filelock==3.6.0 + - flatbuffers==2.0 - huggingface-hub==0.5.1 - hydra-core==1.1.2 - importlib-resources==5.2.3 + - jieba3k==0.35.1 + - jinja2==3.1.2 - joblib==1.1.0 + - langcodes==3.3.0 + - lightgbm==3.3.2 + - lxml==4.6.3 + - markupsafe==2.1.1 + - murmurhash==1.0.7 + - newspaper3k==0.2.8 - nltk==3.7 + - nmslib==2.1.1 - omegaconf==2.1.2 + - onnxruntime==1.11.1 + - pandas==1.4.3 + - pathy==0.6.2 + - preshed==3.0.6 + - psutil==5.9.1 + - pybind11==2.6.1 + - pydantic==1.8.2 + - pyjnius==1.4.2 - pyparsing==3.0.8 + - pyserini==0.17.0 - pytrec-eval==0.5 - pyyaml==6.0 - regex==2022.4.24 + - requests-file==1.5.1 - sacremoses==0.0.53 - scikit-learn==1.0.2 - scipy==1.8.0 - sentence-transformers==2.2.0 - sentencepiece==0.1.96 + - sgmllib3k==1.0.0 + - smart-open==5.2.1 + - soupsieve==2.3.2.post1 + - spacy==3.3.1 + - spacy-legacy==3.0.9 + - spacy-loggers==1.0.2 + - srsly==2.4.3 + - tbb==2021.6.0 + - tbb-devel==2021.6.0 + - thinc==8.0.17 - threadpoolctl==3.1.0 + - tinysegmenter==0.3 + - tldextract==3.3.0 - tokenizers==0.12.1 - tqdm==4.64.0 - transformers==4.18.0 - - zipp==3.8.0 -prefix: /home/classanc/miniconda3/envs/splade + - typer==0.4.2 + - urllib3==1.25.11 + - warcio==1.7.4 + - wasabi==0.9.1 \ No newline at end of file diff --git a/conf/efficient_splade/config_BT_VI_large.yaml b/conf/efficient_splade/config_BT_VI_large.yaml new file mode 100644 index 0000000..84af1d2 --- /dev/null +++ b/conf/efficient_splade/config_BT_VI_large.yaml @@ -0,0 +1,32 @@ +# @package _global_ + +# FILES +defaults: # (these specify which config FILES to use) + ############## TRAIN ################################### + - ../train/config: efficient_splade + - ../train/data: distil_from_ensemble + - ../train/model: separate_mixed_mlmflops + ############## INDEX ################################### + - ../index: msmarco + ############## RETRIEVE ################################ + - ../retrieve_evaluate: all + ############### FLOPS ################################## + - ../flops: msmarco + +# Direct PARAMETER setting +config: + loss: KlDiv + regularizer: + FLOPS: + lambda_d: 5e-4 + T: 50000 + targeted_rep: rep + reg: FLOPS + L1: + lambda_q: 5e-4 + T: 50000 + targeted_rep: rep + reg: L1 + checkpoint_dir: models/efficient/VI/BT/large/checkpoint + index_dir: models/efficient/VI/BT/large/index + out_dir: models/efficient/VI/BT/large/out \ No newline at end of file diff --git a/conf/efficient_splade/config_BT_VI_medium.yaml b/conf/efficient_splade/config_BT_VI_medium.yaml new file mode 100644 index 0000000..6758ead --- /dev/null +++ b/conf/efficient_splade/config_BT_VI_medium.yaml @@ -0,0 +1,32 @@ +# @package _global_ + +# FILES +defaults: # (these specify which config FILES to use) + ############## TRAIN ################################### + - ../train/config: 
efficient_splade + - ../train/data: distil_from_ensemble + - ../train/model: separate_mixed_mlmflops + ############## INDEX ################################### + - ../index: msmarco + ############## RETRIEVE ################################ + - ../retrieve_evaluate: all + ############### FLOPS ################################## + - ../flops: msmarco + +# Direct PARAMETER setting +config: + loss: KlDiv + regularizer: + FLOPS: + lambda_d: 5e-4 + T: 50000 + targeted_rep: rep + reg: FLOPS + L1: + lambda_q: 5e-3 + T: 50000 + targeted_rep: rep + reg: L1 + checkpoint_dir: models/efficient/VI/BT/medium/checkpoint + index_dir: models/efficient/VI/BT/medium/index + out_dir: models/efficient/VI/BT/medium/out \ No newline at end of file diff --git a/conf/efficient_splade/config_BT_VI_small.yaml b/conf/efficient_splade/config_BT_VI_small.yaml new file mode 100644 index 0000000..74514fe --- /dev/null +++ b/conf/efficient_splade/config_BT_VI_small.yaml @@ -0,0 +1,32 @@ +# @package _global_ + +# FILES +defaults: # (these specify which config FILES to use) + ############## TRAIN ################################### + - ../train/config: efficient_splade + - ../train/data: distil_from_ensemble + - ../train/model: separate_mixed_mlmflops + ############## INDEX ################################### + - ../index: msmarco + ############## RETRIEVE ################################ + - ../retrieve_evaluate: all + ############### FLOPS ################################## + - ../flops: msmarco + +# Direct PARAMETER setting +config: + loss: KlDiv + regularizer: + FLOPS: + lambda_d: 5e-3 + T: 50000 + targeted_rep: rep + reg: FLOPS + L1: + lambda_q: 5e-3 + T: 50000 + targeted_rep: rep + reg: L1 + checkpoint_dir: models/efficient/VI/BT/small/checkpoint + index_dir: models/efficient/VI/BT/small/index + out_dir: models/efficient/VI/BT/small/out \ No newline at end of file diff --git a/conf/efficient_splade/config_V_from_huggingface.yaml b/conf/efficient_splade/config_V_from_huggingface.yaml new file mode 100644 index 0000000..6620e83 --- /dev/null +++ b/conf/efficient_splade/config_V_from_huggingface.yaml @@ -0,0 +1,26 @@ +# @package _global_ + +# FILES +defaults: # (these specify which config FILES to use) + ############## INDEX ################################### + - ../index: msmarco + ############## RETRIEVE ################################ + - ../retrieve_evaluate: all + ############### FLOPS ################################## + - ../flops: msmarco + +# Direct PARAMETER setting +config: + checkpoint_dir: models/hf/V/large/checkpoint + index_dir: models/hf/V/large/index + out_dir: models/hf/V/large/out + pretrained_no_yamlconfig: true + tokenizer_type: naver/efficient-splade-V-large-doc + index_batch_size: 500 + eval_batch_size: 500 +init_dict: + model_type_or_dir: naver/efficient-splade-V-large-doc + model_type_or_dir_q: naver/efficient-splade-V-large-query + freeze_d_model: 0 + agg: max + fp16: true \ No newline at end of file diff --git a/conf/efficient_splade/config_V_large.yaml b/conf/efficient_splade/config_V_large.yaml new file mode 100644 index 0000000..3115d4f --- /dev/null +++ b/conf/efficient_splade/config_V_large.yaml @@ -0,0 +1,32 @@ +# @package _global_ + +# FILES +defaults: # (these specify which config FILES to use) + ############## TRAIN ################################### + - ../train/config: efficient_splade + - ../train/data: distil_from_ensemble + - ../train/model: separate_distilbert_mlmflops + ############## INDEX ################################### + - ../index: msmarco + ############## RETRIEVE 
################################ + - ../retrieve_evaluate: all + ############### FLOPS ################################## + - ../flops: msmarco + +# Direct PARAMETER setting +config: + loss: KlDiv + regularizer: + FLOPS: + lambda_d: 5e-4 + T: 50000 + targeted_rep: rep + reg: FLOPS + L1: + lambda_q: 5e-4 + T: 50000 + targeted_rep: rep + reg: L1 + checkpoint_dir: models/efficient/V/large/checkpoint + index_dir: models/efficient/V/large/index + out_dir: models/efficient/V/large/out \ No newline at end of file diff --git a/conf/efficient_splade/config_V_medium.yaml b/conf/efficient_splade/config_V_medium.yaml new file mode 100644 index 0000000..8948eed --- /dev/null +++ b/conf/efficient_splade/config_V_medium.yaml @@ -0,0 +1,32 @@ +# @package _global_ + +# FILES +defaults: # (these specify which config FILES to use) + ############## TRAIN ################################### + - ../train/config: efficient_splade + - ../train/data: distil_from_ensemble + - ../train/model: separate_distilbert_mlmflops + ############## INDEX ################################### + - ../index: msmarco + ############## RETRIEVE ################################ + - ../retrieve_evaluate: all + ############### FLOPS ################################## + - ../flops: msmarco + +# Direct PARAMETER setting +config: + loss: KlDiv + regularizer: + FLOPS: + lambda_d: 5e-4 + T: 50000 + targeted_rep: rep + reg: FLOPS + L1: + lambda_q: 5e-3 + T: 50000 + targeted_rep: rep + reg: L1 + checkpoint_dir: models/efficient/V/medium/checkpoint + index_dir: models/efficient/V/medium/index + out_dir: models/efficient/V/medium/out \ No newline at end of file diff --git a/conf/efficient_splade/config_V_small.yaml b/conf/efficient_splade/config_V_small.yaml new file mode 100644 index 0000000..c027804 --- /dev/null +++ b/conf/efficient_splade/config_V_small.yaml @@ -0,0 +1,32 @@ +# @package _global_ + +# FILES +defaults: # (these specify which config FILES to use) + ############## TRAIN ################################### + - ../train/config: efficient_splade + - ../train/data: distil_from_ensemble + - ../train/model: separate_distilbert_mlmflops + ############## INDEX ################################### + - ../index: msmarco + ############## RETRIEVE ################################ + - ../retrieve_evaluate: all + ############### FLOPS ################################## + - ../flops: msmarco + +# Direct PARAMETER setting +config: + loss: KlDiv + regularizer: + FLOPS: + lambda_d: 5e-3 + T: 50000 + targeted_rep: rep + reg: FLOPS + L1: + lambda_q: 5e-3 + T: 50000 + targeted_rep: rep + reg: L1 + checkpoint_dir: models/efficient/V/small/checkpoint + index_dir: models/efficient/V/small/index + out_dir: models/efficient/V/small/out diff --git a/conf/train/config/efficient_splade.yaml b/conf/train/config/efficient_splade.yaml new file mode 100644 index 0000000..c179d69 --- /dev/null +++ b/conf/train/config/efficient_splade.yaml @@ -0,0 +1,20 @@ +# @package config + +lr: 2e-5 +seed: 123 +gradient_accumulation_steps: 1 +weight_decay: 0.01 +validation_metrics: [ MRR@10, recall@100, recall@200, recall@500 ] +pretrained_no_yamlconfig: false +nb_iterations: 250000 +train_batch_size: 128 # number of gpus needs to divide this +eval_batch_size: 600 +index_retrieve_batch_size: 500 +record_frequency: 10000 +train_monitoring_freq: 500 +warmup_steps: 6000 +max_length: 256 +fp16: true +matching_type: splade +monitoring_ckpt: MRR@10 # or e.g. 
MRR@10 +overwrite_final: true \ No newline at end of file diff --git a/conf/train/model/separate_distilbert_mlmflops.yaml b/conf/train/model/separate_distilbert_mlmflops.yaml new file mode 100644 index 0000000..0696e02 --- /dev/null +++ b/conf/train/model/separate_distilbert_mlmflops.yaml @@ -0,0 +1,11 @@ +# @package _global_ + +init_dict: + model_type_or_dir: models/mlm_flops/DistilBERT + model_type_or_dir_q: models/mlm_flops/DistilBERT + freeze_d_model: 0 + agg: max + fp16: true + +config: + tokenizer_type: models/mlm_flops/DistilBERT \ No newline at end of file diff --git a/conf/train/model/separate_mixed_mlmflops.yaml b/conf/train/model/separate_mixed_mlmflops.yaml new file mode 100644 index 0000000..bd37526 --- /dev/null +++ b/conf/train/model/separate_mixed_mlmflops.yaml @@ -0,0 +1,11 @@ +# @package _global_ + +init_dict: + model_type_or_dir: models/mlm_flops/DistilBERT + model_type_or_dir_q: models/mlm_flops/BERTiny + freeze_d_model: 0 + agg: max + fp16: true + +config: + tokenizer_type: models/mlm_flops/DistilBERT \ No newline at end of file diff --git a/efficient_splade_pisa/README.md b/efficient_splade_pisa/README.md new file mode 100644 index 0000000..f8de7ed --- /dev/null +++ b/efficient_splade_pisa/README.md @@ -0,0 +1,64 @@ +## Step 0 - Install pisa from the weight-queries branch + +The pre-requisite step is to install pisa. Note that it could take a while. + +``` +git clone https://github.com/pisa-engine/pisa.git +cd pisa +git checkout weight-queries +mkdir build +cd build +cmake .. -DCMAKE_BUILD_TYPE=Release +make +cd ../../ +``` + +## Step 1 - Download indexes and queries and extract here + +``` +wget https://www.dropbox.com/s/odkkbgg8lopcduk/pisa_index.tar.gz?dl=0 -O pisa_index.tar.gz +tar xzvf pisa_index.tar.gz +``` + +## Step 2 - Run parallel retrieval to get QPS and effectiveness + +``` +export level=level_5 #level_5 for V) and level_6 for VI) +export size=large # small, medium or large + +pisa/build/bin/evaluate_queries \ + --encoding block_simdbp \ + --documents indexes/$level/$size.docmap \ + --index indexes/$level/$size.block_simdbp.idx \ + --wand indexes/$level/$size.fixed-40.bmw \ + --algorithm block_max_wand \ + -k 1000 \ + --scorer quantized \ + --weighted \ + --queries queries/$level/$size.pisa.ints \ + --run "$level/$size" > ${level}_${size}.trec + +python -m pyserini.eval.trec_eval -c -M 10 -m recip_rank msmarco-passage-dev-subset ${level}_${size}.trec +python -m pyserini.eval.trec_eval -c -mrecall msmarco-passage-dev-subset ${level}_${size}.trec + +``` + +## Step 3 - Run mono-batch mono-cpu retrieval to get average latency + + +``` +level=level_6 #level_5 for V) and level_6 for VI) +size=medium # small, medium or large + +pisa/build/bin/queries \ + --encoding block_simdbp \ + --index indexes/$level/$size.block_simdbp.idx \ + --wand indexes/$level/$size.fixed-40.bmw \ + --algorithm block_max_wand \ + -k 1000 \ + --scorer quantized \ + --weighted \ + --queries queries/$level/$size.pisa.ints + +``` + diff --git a/src/indexing/inverted_index.py b/src/indexing/inverted_index.py index 3698089..6a526e4 100644 --- a/src/indexing/inverted_index.py +++ b/src/indexing/inverted_index.py @@ -14,7 +14,7 @@ class IndexDictOfArray: - def __init__(self, index_path=None, force_new=False, filename="array_index.h5py", dim_voc=None): + def __init__(self, index_path=None, force_new=True, filename="array_index.h5py", dim_voc=None): if index_path is not None: self.index_path = index_path if not os.path.exists(index_path): diff --git a/src/tasks/base/trainer.py 
b/src/tasks/base/trainer.py index 0058ff9..fc50f59 100644 --- a/src/tasks/base/trainer.py +++ b/src/tasks/base/trainer.py @@ -66,8 +66,7 @@ def __init__(self, model, loss, optimizer, config, train_loader, validation_loss self.saver = ValidationSaver(loss=True if self.config["monitoring_ckpt"] == "loss" else False) self.val_decision = "loss" if self.config["monitoring_ckpt"] == "loss" else self.config[ "monitoring_ckpt"] - self.overwrite_final = config["monitoring_ckpt"]["overwrite_final"] if "overwrite_final" in config[ - "monitoring_ckpt"] else False + self.overwrite_final = config["overwrite_final"] if "overwrite_final" in config else False self.training_res_handler = open(os.path.join(self.checkpoint_dir, "training_perf.txt"), "a") # => text file in which we record some training perf if self.validation:
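
Note on the `src/tasks/base/trainer.py` hunk above: `overwrite_final` is now read from the top level of the config instead of from under `monitoring_ckpt` (a string such as `"MRR@10"`, so the old lookup could never find the key), which is what lets the new `overwrite_final: true` entry in `conf/train/config/efficient_splade.yaml` take effect. Below is a minimal sketch of the corrected lookup — not part of the patch — with a plain dict standing in for the resolved Hydra config:

```python
# Minimal sketch (not part of the patch) of the corrected lookup in
# src/tasks/base/trainer.py. A plain dict stands in for the resolved Hydra
# config; key names match conf/train/config/efficient_splade.yaml.
config = {
    "monitoring_ckpt": "MRR@10",  # metric (or "loss") used to pick the best checkpoint
    "overwrite_final": True,      # new top-level flag set by the efficient_splade config
}

# Before the fix, the flag was looked up under config["monitoring_ckpt"], which is a
# string, so "overwrite_final" was never found there and the flag silently stayed False.
overwrite_final = config["overwrite_final"] if "overwrite_final" in config else False
print(overwrite_final)  # -> True
```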