Initial pull request for efficient splade (#16)
* release configs for efficient splade

* update environment with efficient splade

* fix overwrite final

* Add readme for pisa indexes

* force_new for inverted indexes
cadurosar committed Jul 8, 2022
1 parent b346a27 commit a16b793
81 changes: 73 additions & 8 deletions conda_splade_env.yml
name: splade
- pytorch
- conda-forge
- defaults
- _libgcc_mutex=0.1=main
Expand All @@ -10,19 +11,26 @@ dependencies:
- aiosignal=1.2.0=pyhd3eb1b0_0
- async-timeout=4.0.1=pyhd3eb1b0_0
- attrs=21.4.0=pyhd3eb1b0_0
- binutils=2.35.1=hdd6e379_2
- binutils_impl_linux-64=2.35.1=h27ae35d_9
- binutils_linux-64=2.35.1=h454624a_30
- blas=1.0=mkl
- blinker=1.4=py38h06a4308_0
- brotli=1.0.9=he6710b0_2
- brotlipy=0.7.0=py38h27cfd23_1003
- bzip2=1.0.8=h7b6447c_0
- c-ares=1.18.1=h7f8727e_0
- ca-certificates=2022.4.26=h06a4308_0
- c-compiler=1.2.0=h7f98852_0
- ca-certificates=2022.6.15=ha878542_0
- cachetools=4.2.2=pyhd3eb1b0_0
- certifi=2021.10.8=py38h06a4308_2
- certifi=2022.6.15=py38h578d9bd_0
- cffi=1.15.0=py38hd667e15_1
- charset-normalizer=2.0.4=pyhd3eb1b0_0
- click=8.1.3=py38h578d9bd_0
- cmake=3.20.2=h541d2ed_0
- cryptography=36.0.0=py38h9ce1e76_0
- cudatoolkit=11.3.1=h2bc3f7f_2
- cxx-compiler=1.2.0=h4bd325d_0
- cycler=0.11.0=pyhd3eb1b0_0
- dataclasses=0.8=pyh6d0b6a4_7
- dbus=1.13.18=hb2f20db_0
Expand All @@ -32,6 +40,8 @@ dependencies:
- fonttools=4.25.0=pyhd3eb1b0_0
- freetype=2.11.0=h70c0345_0
- frozenlist=1.2.0=py38h7f8727e_0
- gcc_impl_linux-64=9.3.0=h70c0ae5_19
- gcc_linux-64=9.3.0=h1ee779e_30
- giflib=5.2.1=h7b6447c_0
- glib=2.69.1=h4ff587b_1
- gmp=6.2.1=h2531618_2
Expand All @@ -41,27 +51,38 @@ dependencies:
- grpcio=1.42.0=py38hce63b2e_0
- gst-plugins-base=1.14.0=h8213a91_2
- gstreamer=1.14.0=h28cd5cc_2
- gxx_impl_linux-64=9.3.0=hd87eabc_19
- gxx_linux-64=9.3.0=h7e70986_30
- h5py=3.6.0=py38ha0f2276_0
- hdf5=1.10.6=hb1b8bf9_0
- icu=58.2=he6710b0_3
- idna=3.3=pyhd3eb1b0_0
- importlib-metadata=4.11.3=py38h06a4308_0
- intel-openmp=2021.4.0=h06a4308_3561
- jpeg=9e=h7f8727e_0
- kernel-headers_linux-64=2.6.32=he073ed8_15
- kiwisolver=1.3.2=py38h295c915_0
- krb5=1.19.2=hcc1bbae_0
- lame=3.100=h7b6447c_0
- lcms2=2.12=h3be6417_0
- ld_impl_linux-64=2.35.1=h7274673_9
- libcurl=7.82.0=h0b77cf5_0
- libedit=3.1.20210910=h7f8727e_0
- libev=4.33=h516909a_1
- libffi=3.3=he6710b0_2
- libgcc-devel_linux-64=9.3.0=h7864c58_19
- libgcc-ng=9.3.0=h5101ec6_17
- libgfortran-ng=7.5.0=ha8ba4b0_17
- libgfortran4=7.5.0=ha8ba4b0_17
- libgomp=9.3.0=h5101ec6_17
- libiconv=1.16=h7f8727e_2
- libidn2=2.3.2=h7f8727e_0
- libllvm11=11.1.0=h3826bc1_1
- libnghttp2=1.46.0=hce63b2e_0
- libpng=1.6.37=hbc83047_0
- libprotobuf=3.19.1=h4ff587b_0
- libssh2=1.10.0=h8f2d780_0
- libstdcxx-devel_linux-64=9.3.0=hb016644_19
- libstdcxx-ng=9.3.0=hd4cf53a_17
- libtasn1=4.16.0=h27cfd23_0
- libtiff=4.2.0=h85742a9_0
Expand Down Expand Up @@ -90,7 +111,8 @@ dependencies:
- numpy-base=1.21.5=py38hf524024_1
- oauthlib=3.1.0=py_0
- openh264=2.1.1=h4ff587b_0
- openssl=1.1.1n=h7f8727e_0
- openjdk=11.0.13=h87a67e3_0
- openssl=1.1.1o=h7f8727e_0
- packaging=21.3=pyhd3eb1b0_0
- pcre=8.45=h295c915_0
- pillow=9.0.1=py38h22f2fdc_0
Expand All @@ -105,18 +127,20 @@ dependencies:
- pysocks=1.7.1=py38h06a4308_0
- python=3.8.13=h12debd9_0
- python-dateutil=2.8.2=pyhd3eb1b0_0
- python_abi=3.8=2_cp38
- pytorch=1.11.0=py3.8_cuda11.3_cudnn8.2.0_0
- pytorch-mutex=1.0=cuda
- qt=5.9.7=h5867ecd_1
- readline=8.1.2=h7f8727e_1
- requests=2.27.1=pyhd3eb1b0_0
- requests-oauthlib=1.3.0=py_0
- rhash=1.4.1=h7f98852_0
- rsa=4.7.2=pyhd3eb1b0_1
- setuptools=61.2.0=py38h06a4308_0
- sip=4.19.13=py38h295c915_0
- six=1.16.0=pyhd3eb1b0_1
- sqlite=3.38.2=hc218d9a_0
- tbb=2021.5.0=hd09550d_0
- sysroot_linux-64=2.12=he073ed8_15
- tensorboard=2.6.0=py_1
- tensorboard-data-server=0.6.0=py38hca6d32c_0
- tensorboard-plugin-wit=1.6.0=py_0
Expand All @@ -126,38 +150,79 @@ dependencies:
- tornado=6.1=py38h27cfd23_0
- typing-extensions=4.1.1=hd3eb1b0_0
- typing_extensions=4.1.1=pyh06a4308_0
- urllib3=1.26.9=py38h06a4308_0
- werkzeug=2.0.3=pyhd3eb1b0_0
- wheel=0.37.1=pyhd3eb1b0_0
- xz=5.2.5=h7b6447c_0
- yarl=1.6.3=py38h27cfd23_0
- zipp=3.8.0=py38h06a4308_0
- zlib=1.2.12=h7f8727e_2
- zstd=1.4.9=haebb681_0
- pip:
- antlr4-python3-runtime==4.8
- beautifulsoup4==4.11.1
- beir==1.0.0
- click==8.1.3
- blis==0.7.8
- catalogue==2.0.7
- cssselect==1.1.0
- cymem==2.0.6
- cython==0.29.30
- elasticsearch==7.9.1
- faiss-cpu==1.7.2
- feedfinder2==0.0.4
- feedparser==6.0.10
- filelock==3.6.0
- flatbuffers==2.0
- huggingface-hub==0.5.1
- hydra-core==1.1.2
- importlib-resources==5.2.3
- jieba3k==0.35.1
- jinja2==3.1.2
- joblib==1.1.0
- langcodes==3.3.0
- lightgbm==3.3.2
- lxml==4.6.3
- markupsafe==2.1.1
- murmurhash==1.0.7
- newspaper3k==0.2.8
- nltk==3.7
- nmslib==2.1.1
- omegaconf==2.1.2
- onnxruntime==1.11.1
- pandas==1.4.3
- pathy==0.6.2
- preshed==3.0.6
- psutil==5.9.1
- pybind11==2.6.1
- pydantic==1.8.2
- pyjnius==1.4.2
- pyparsing==3.0.8
- pyserini==0.17.0
- pytrec-eval==0.5
- pyyaml==6.0
- regex==2022.4.24
- requests-file==1.5.1
- sacremoses==0.0.53
- scikit-learn==1.0.2
- scipy==1.8.0
- sentence-transformers==2.2.0
- sentencepiece==0.1.96
- sgmllib3k==1.0.0
- smart-open==5.2.1
- soupsieve==2.3.2.post1
- spacy==3.3.1
- spacy-legacy==3.0.9
- spacy-loggers==1.0.2
- srsly==2.4.3
- tbb==2021.6.0
- tbb-devel==2021.6.0
- thinc==8.0.17
- threadpoolctl==3.1.0
- tinysegmenter==0.3
- tldextract==3.3.0
- tokenizers==0.12.1
- tqdm==4.64.0
- transformers==4.18.0
- zipp==3.8.0
prefix: /home/classanc/miniconda3/envs/splade
- typer==0.4.2
- urllib3==1.25.11
- warcio==1.7.4
- wasabi==0.9.1
32 changes: 32 additions & 0 deletions conf/efficient_splade/config_BT_VI_large.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# @package _global_

defaults: # (these specify which config FILES to use)
############## TRAIN ###################################
- ../train/config: efficient_splade
- ../train/data: distil_from_ensemble
- ../train/model: separate_mixed_mlmflops
############## INDEX ###################################
- ../index: msmarco
############## RETRIEVE ################################
- ../retrieve_evaluate: all
############### FLOPS ##################################
- ../flops: msmarco

# Direct PARAMETER setting
loss: KlDiv
lambda_d: 5e-4
T: 50000
targeted_rep: rep
reg: FLOPS
lambda_q: 5e-4
T: 50000
targeted_rep: rep
reg: L1
checkpoint_dir: models/efficient/VI/BT/large/checkpoint
index_dir: models/efficient/VI/BT/large/index
out_dir: models/efficient/VI/BT/large/out
32 changes: 32 additions & 0 deletions conf/efficient_splade/config_BT_VI_medium.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# @package _global_

defaults: # (these specify which config FILES to use)
############## TRAIN ###################################
- ../train/config: efficient_splade
- ../train/data: distil_from_ensemble
- ../train/model: separate_mixed_mlmflops
############## INDEX ###################################
- ../index: msmarco
############## RETRIEVE ################################
- ../retrieve_evaluate: all
############### FLOPS ##################################
- ../flops: msmarco

# Direct PARAMETER setting
loss: KlDiv
lambda_d: 5e-4
T: 50000
targeted_rep: rep
reg: FLOPS
lambda_q: 5e-3
T: 50000
targeted_rep: rep
reg: L1
checkpoint_dir: models/efficient/VI/BT/medium/checkpoint
index_dir: models/efficient/VI/BT/medium/index
out_dir: models/efficient/VI/BT/medium/out
32 changes: 32 additions & 0 deletions conf/efficient_splade/config_BT_VI_small.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# @package _global_

defaults: # (these specify which config FILES to use)
############## TRAIN ###################################
- ../train/config: efficient_splade
- ../train/data: distil_from_ensemble
- ../train/model: separate_mixed_mlmflops
############## INDEX ###################################
- ../index: msmarco
############## RETRIEVE ################################
- ../retrieve_evaluate: all
############### FLOPS ##################################
- ../flops: msmarco

# Direct PARAMETER setting
loss: KlDiv
lambda_d: 5e-3
T: 50000
targeted_rep: rep
reg: FLOPS
lambda_q: 5e-3
T: 50000
targeted_rep: rep
reg: L1
checkpoint_dir: models/efficient/VI/BT/small/checkpoint
index_dir: models/efficient/VI/BT/small/index
out_dir: models/efficient/VI/BT/small/out
26 changes: 26 additions & 0 deletions conf/efficient_splade/config_V_from_huggingface.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# @package _global_

defaults: # (these specify which config FILES to use)
############## INDEX ###################################
- ../index: msmarco
############## RETRIEVE ################################
- ../retrieve_evaluate: all
############### FLOPS ##################################
- ../flops: msmarco

# Direct PARAMETER setting
checkpoint_dir: models/hf/V/large/checkpoint
index_dir: models/hf/V/large/index
out_dir: models/hf/V/large/out
pretrained_no_yamlconfig: true
tokenizer_type: naver/efficient-splade-V-large-doc
index_batch_size: 500
eval_batch_size: 500
model_type_or_dir: naver/efficient-splade-V-large-doc
model_type_or_dir_q: naver/efficient-splade-V-large-query
freeze_d_model: 0
agg: max
fp16: true
32 changes: 32 additions & 0 deletions conf/efficient_splade/config_V_large.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# @package _global_

defaults: # (these specify which config FILES to use)
############## TRAIN ###################################
- ../train/config: efficient_splade
- ../train/data: distil_from_ensemble
- ../train/model: separate_distilbert_mlmflops
############## INDEX ###################################
- ../index: msmarco
############## RETRIEVE ################################
- ../retrieve_evaluate: all
############### FLOPS ##################################
- ../flops: msmarco

# Direct PARAMETER setting
loss: KlDiv
lambda_d: 5e-4
T: 50000
targeted_rep: rep
reg: FLOPS
lambda_q: 5e-4
T: 50000
targeted_rep: rep
reg: L1
checkpoint_dir: models/efficient/V/large/checkpoint
index_dir: models/efficient/V/large/index
out_dir: models/efficient/V/large/out
32 changes: 32 additions & 0 deletions conf/efficient_splade/config_V_medium.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# @package _global_

defaults: # (these specify which config FILES to use)
############## TRAIN ###################################
- ../train/config: efficient_splade
- ../train/data: distil_from_ensemble
- ../train/model: separate_distilbert_mlmflops
############## INDEX ###################################
- ../index: msmarco
############## RETRIEVE ################################
- ../retrieve_evaluate: all
############### FLOPS ##################################
- ../flops: msmarco

# Direct PARAMETER setting
loss: KlDiv
lambda_d: 5e-4
T: 50000
targeted_rep: rep
reg: FLOPS
lambda_q: 5e-3
T: 50000
targeted_rep: rep
reg: L1
checkpoint_dir: models/efficient/V/medium/checkpoint
index_dir: models/efficient/V/medium/index
out_dir: models/efficient/V/medium/out

