Spaces:
Running
Running
added scripts for testing inference
Browse files- pixi.lock +161 -0
- pixi.toml +1 -0
- src/do_pca_on_tfidf.py +70 -0
- src/encode.py +14 -0
- src/search-embeddings.py +67 -0
pixi.lock
CHANGED
|
@@ -11,10 +11,14 @@ environments:
|
|
| 11 |
- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda
|
| 12 |
- conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.12.0-pyhcf101f3_0.conda
|
| 13 |
- conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda
|
|
|
|
|
|
|
| 14 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_8.conda
|
| 15 |
- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.11.12-hbd8a1cb_0.conda
|
| 16 |
- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.11.12-pyhd8ed1ab_0.conda
|
|
|
|
| 17 |
- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda
|
|
|
|
| 18 |
- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.12-py312hd8ed1ab_1.conda
|
| 19 |
- conda: https://conda.anaconda.org/conda-forge/noarch/decorator-5.2.1-pyhd8ed1ab_0.conda
|
| 20 |
- conda: https://conda.anaconda.org/conda-forge/noarch/dnspython-2.8.0-pyhcf101f3_0.conda
|
|
@@ -33,10 +37,12 @@ environments:
|
|
| 33 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/gmpy2-2.2.1-py312hee6aa52_2.conda
|
| 34 |
- conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda
|
| 35 |
- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda
|
|
|
|
| 36 |
- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda
|
| 37 |
- conda: https://conda.anaconda.org/conda-forge/noarch/httpcore-1.0.9-pyh29332c3_0.conda
|
| 38 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/httptools-0.7.1-py312h4409184_1.conda
|
| 39 |
- conda: https://conda.anaconda.org/conda-forge/noarch/httpx-0.28.1-pyhd8ed1ab_0.conda
|
|
|
|
| 40 |
- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda
|
| 41 |
- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda
|
| 42 |
- conda: https://conda.anaconda.org/conda-forge/noarch/ipython-9.8.0-pyh53cf698_0.conda
|
|
@@ -75,6 +81,7 @@ environments:
|
|
| 75 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/numpy-2.3.5-py312h85ea64e_0.conda
|
| 76 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.6.0-h5503f6c_0.conda
|
| 77 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/optree-0.18.0-py312h84eede6_0.conda
|
|
|
|
| 78 |
- conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.5-pyhcf101f3_0.conda
|
| 79 |
- conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda
|
| 80 |
- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.35.2-pyh6a1acc5_0.conda
|
|
@@ -88,6 +95,7 @@ environments:
|
|
| 88 |
- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda
|
| 89 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/pydantic-core-2.41.5-py312h6ef9ec0_1.conda
|
| 90 |
- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda
|
|
|
|
| 91 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.12-h18782d2_1_cpython.conda
|
| 92 |
- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.1-pyhcf101f3_0.conda
|
| 93 |
- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.12.12-hd8ed1ab_1.conda
|
|
@@ -96,6 +104,7 @@ environments:
|
|
| 96 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/pytorch-2.9.1-cpu_generic_py312_hdde6e1b_1.conda
|
| 97 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/pyyaml-6.0.3-py312h5748b74_0.conda
|
| 98 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h1d1bf99_2.conda
|
|
|
|
| 99 |
- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.2.0-pyhcf101f3_0.conda
|
| 100 |
- conda: https://conda.anaconda.org/conda-forge/noarch/rich-toolkit-0.17.0-pyhcf101f3_0.conda
|
| 101 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/scikit-learn-1.7.2-py312h79e0ffc_0.conda
|
|
@@ -110,6 +119,7 @@ environments:
|
|
| 110 |
- conda: https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda
|
| 111 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h892fb3f_3.conda
|
| 112 |
- conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.3.0-pyhcf101f3_0.conda
|
|
|
|
| 113 |
- conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda
|
| 114 |
- conda: https://conda.anaconda.org/conda-forge/noarch/typer-0.20.0-pyhefaf540_1.conda
|
| 115 |
- conda: https://conda.anaconda.org/conda-forge/noarch/typer-slim-0.20.0-pyhcf101f3_1.conda
|
|
@@ -118,6 +128,7 @@ environments:
|
|
| 118 |
- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda
|
| 119 |
- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda
|
| 120 |
- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda
|
|
|
|
| 121 |
- conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.38.0-pyh31011fe_0.conda
|
| 122 |
- conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-standard-0.38.0-h31011fe_0.conda
|
| 123 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/uvloop-0.22.1-py312h4409184_1.conda
|
|
@@ -125,6 +136,7 @@ environments:
|
|
| 125 |
- conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda
|
| 126 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/websockets-15.0.1-py312h290adc7_2.conda
|
| 127 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/yaml-0.2.5-h925e9cb_3.conda
|
|
|
|
| 128 |
packages:
|
| 129 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/_openmp_mutex-4.5-7_kmp_llvm.conda
|
| 130 |
build_number: 7
|
|
@@ -193,6 +205,33 @@ packages:
|
|
| 193 |
license_family: Apache
|
| 194 |
size: 28797
|
| 195 |
timestamp: 1763410017955
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_8.conda
|
| 197 |
sha256: b456200636bd5fecb2bec63f7e0985ad2097cf1b83d60ce0b6968dffa6d02aa1
|
| 198 |
md5: 58fd217444c2a5701a44244faf518206
|
|
@@ -218,6 +257,15 @@ packages:
|
|
| 218 |
license: ISC
|
| 219 |
size: 157131
|
| 220 |
timestamp: 1762976260320
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda
|
| 222 |
sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715
|
| 223 |
md5: ea8a6c3256897cc31263de9f455e25d9
|
|
@@ -229,6 +277,15 @@ packages:
|
|
| 229 |
license_family: BSD
|
| 230 |
size: 97676
|
| 231 |
timestamp: 1764518652276
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.12-py312hd8ed1ab_1.conda
|
| 233 |
noarch: generic
|
| 234 |
sha256: b88c76a6d6b45378552ccfd9e88b2a073161fe83fd1294c8fa103ffd32f7934a
|
|
@@ -442,6 +499,22 @@ packages:
|
|
| 442 |
license_family: MIT
|
| 443 |
size: 95967
|
| 444 |
timestamp: 1756364871835
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda
|
| 446 |
sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba
|
| 447 |
md5: 0a802cb9888dd14eeefc611f05c40b6e
|
|
@@ -491,6 +564,27 @@ packages:
|
|
| 491 |
license_family: BSD
|
| 492 |
size: 63082
|
| 493 |
timestamp: 1733663449209
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda
|
| 495 |
sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8
|
| 496 |
md5: 8e6923fc12f1fe8f8c4e5c9f343256ac
|
|
@@ -944,6 +1038,16 @@ packages:
|
|
| 944 |
license_family: Apache
|
| 945 |
size: 390301
|
| 946 |
timestamp: 1763124958546
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 947 |
- conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.5-pyhcf101f3_0.conda
|
| 948 |
sha256: 30de7b4d15fbe53ffe052feccde31223a236dae0495bab54ab2479de30b2990f
|
| 949 |
md5: a110716cdb11cf51482ff4000dc253d7
|
|
@@ -1105,6 +1209,16 @@ packages:
|
|
| 1105 |
license_family: BSD
|
| 1106 |
size: 889287
|
| 1107 |
timestamp: 1750615908735
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1108 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.12-h18782d2_1_cpython.conda
|
| 1109 |
build_number: 1
|
| 1110 |
sha256: 626da9bb78459ce541407327d1e22ee673fd74e9103f1a0e0f4e3967ad0a23a7
|
|
@@ -1225,6 +1339,21 @@ packages:
|
|
| 1225 |
license_family: GPL
|
| 1226 |
size: 252359
|
| 1227 |
timestamp: 1740379663071
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1228 |
- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.2.0-pyhcf101f3_0.conda
|
| 1229 |
sha256: edfb44d0b6468a8dfced728534c755101f06f1a9870a7ad329ec51389f16b086
|
| 1230 |
md5: a247579d8a59931091b16a1e932bbed6
|
|
@@ -1395,6 +1524,15 @@ packages:
|
|
| 1395 |
license_family: MIT
|
| 1396 |
size: 20973
|
| 1397 |
timestamp: 1760014679845
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1398 |
- conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda
|
| 1399 |
sha256: f39a5620c6e8e9e98357507262a7869de2ae8cc07da8b7f84e517c9fd6c2b959
|
| 1400 |
md5: 019a7385be9af33791c989871317e1ed
|
|
@@ -1477,6 +1615,19 @@ packages:
|
|
| 1477 |
license: LicenseRef-Public-Domain
|
| 1478 |
size: 122968
|
| 1479 |
timestamp: 1742727099393
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1480 |
- conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.38.0-pyh31011fe_0.conda
|
| 1481 |
sha256: 32e637726fd7cfeb74058e829b116e17514d001846fef56d8c763ec9ec5ac887
|
| 1482 |
md5: d3aa78bc38d9478e9eed5f128ba35f41
|
|
@@ -1563,3 +1714,13 @@ packages:
|
|
| 1563 |
license_family: MIT
|
| 1564 |
size: 83386
|
| 1565 |
timestamp: 1753484079473
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda
|
| 12 |
- conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.12.0-pyhcf101f3_0.conda
|
| 13 |
- conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda
|
| 14 |
+
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/backports.zstd-1.2.0-py312h84d6f5f_0.conda
|
| 15 |
+
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/brotli-python-1.2.0-py312h0dfefe5_1.conda
|
| 16 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_8.conda
|
| 17 |
- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.11.12-hbd8a1cb_0.conda
|
| 18 |
- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.11.12-pyhd8ed1ab_0.conda
|
| 19 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.4-pyhd8ed1ab_0.conda
|
| 20 |
- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda
|
| 21 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda
|
| 22 |
- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.12-py312hd8ed1ab_1.conda
|
| 23 |
- conda: https://conda.anaconda.org/conda-forge/noarch/decorator-5.2.1-pyhd8ed1ab_0.conda
|
| 24 |
- conda: https://conda.anaconda.org/conda-forge/noarch/dnspython-2.8.0-pyhcf101f3_0.conda
|
|
|
|
| 37 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/gmpy2-2.2.1-py312hee6aa52_2.conda
|
| 38 |
- conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda
|
| 39 |
- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda
|
| 40 |
+
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/hf-xet-1.2.1-py310h6ce4931_0.conda
|
| 41 |
- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda
|
| 42 |
- conda: https://conda.anaconda.org/conda-forge/noarch/httpcore-1.0.9-pyh29332c3_0.conda
|
| 43 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/httptools-0.7.1-py312h4409184_1.conda
|
| 44 |
- conda: https://conda.anaconda.org/conda-forge/noarch/httpx-0.28.1-pyhd8ed1ab_0.conda
|
| 45 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-1.2.1-pyhd8ed1ab_0.conda
|
| 46 |
- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda
|
| 47 |
- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda
|
| 48 |
- conda: https://conda.anaconda.org/conda-forge/noarch/ipython-9.8.0-pyh53cf698_0.conda
|
|
|
|
| 81 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/numpy-2.3.5-py312h85ea64e_0.conda
|
| 82 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.6.0-h5503f6c_0.conda
|
| 83 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/optree-0.18.0-py312h84eede6_0.conda
|
| 84 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda
|
| 85 |
- conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.5-pyhcf101f3_0.conda
|
| 86 |
- conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda
|
| 87 |
- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.35.2-pyh6a1acc5_0.conda
|
|
|
|
| 95 |
- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda
|
| 96 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/pydantic-core-2.41.5-py312h6ef9ec0_1.conda
|
| 97 |
- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda
|
| 98 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda
|
| 99 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.12-h18782d2_1_cpython.conda
|
| 100 |
- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.1-pyhcf101f3_0.conda
|
| 101 |
- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.12.12-hd8ed1ab_1.conda
|
|
|
|
| 104 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/pytorch-2.9.1-cpu_generic_py312_hdde6e1b_1.conda
|
| 105 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/pyyaml-6.0.3-py312h5748b74_0.conda
|
| 106 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h1d1bf99_2.conda
|
| 107 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhd8ed1ab_0.conda
|
| 108 |
- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.2.0-pyhcf101f3_0.conda
|
| 109 |
- conda: https://conda.anaconda.org/conda-forge/noarch/rich-toolkit-0.17.0-pyhcf101f3_0.conda
|
| 110 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/scikit-learn-1.7.2-py312h79e0ffc_0.conda
|
|
|
|
| 119 |
- conda: https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda
|
| 120 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h892fb3f_3.conda
|
| 121 |
- conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.3.0-pyhcf101f3_0.conda
|
| 122 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.1-pyhd8ed1ab_1.conda
|
| 123 |
- conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda
|
| 124 |
- conda: https://conda.anaconda.org/conda-forge/noarch/typer-0.20.0-pyhefaf540_1.conda
|
| 125 |
- conda: https://conda.anaconda.org/conda-forge/noarch/typer-slim-0.20.0-pyhcf101f3_1.conda
|
|
|
|
| 128 |
- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda
|
| 129 |
- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda
|
| 130 |
- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda
|
| 131 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.0-pyhd8ed1ab_0.conda
|
| 132 |
- conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.38.0-pyh31011fe_0.conda
|
| 133 |
- conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-standard-0.38.0-h31011fe_0.conda
|
| 134 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/uvloop-0.22.1-py312h4409184_1.conda
|
|
|
|
| 136 |
- conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda
|
| 137 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/websockets-15.0.1-py312h290adc7_2.conda
|
| 138 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/yaml-0.2.5-h925e9cb_3.conda
|
| 139 |
+
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-hbf9d68e_6.conda
|
| 140 |
packages:
|
| 141 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/_openmp_mutex-4.5-7_kmp_llvm.conda
|
| 142 |
build_number: 7
|
|
|
|
| 205 |
license_family: Apache
|
| 206 |
size: 28797
|
| 207 |
timestamp: 1763410017955
|
| 208 |
+
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/backports.zstd-1.2.0-py312h84d6f5f_0.conda
|
| 209 |
+
sha256: 833370729199ef55f3f9efd024e28bba87fcd8b5c397d8afecefde63851e6997
|
| 210 |
+
md5: c0ca697637ef6cf0ac768a50964e4af6
|
| 211 |
+
depends:
|
| 212 |
+
- python
|
| 213 |
+
- __osx >=11.0
|
| 214 |
+
- python 3.12.* *_cpython
|
| 215 |
+
- python_abi 3.12.* *_cp312
|
| 216 |
+
- zstd >=1.5.7,<1.6.0a0
|
| 217 |
+
license: BSD-3-Clause AND MIT AND EPL-2.0
|
| 218 |
+
size: 241337
|
| 219 |
+
timestamp: 1765057702057
|
| 220 |
+
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/brotli-python-1.2.0-py312h0dfefe5_1.conda
|
| 221 |
+
sha256: 6178775a86579d5e8eec6a7ab316c24f1355f6c6ccbe84bb341f342f1eda2440
|
| 222 |
+
md5: 311fcf3f6a8c4eb70f912798035edd35
|
| 223 |
+
depends:
|
| 224 |
+
- __osx >=11.0
|
| 225 |
+
- libcxx >=19
|
| 226 |
+
- python >=3.12,<3.13.0a0
|
| 227 |
+
- python >=3.12,<3.13.0a0 *_cpython
|
| 228 |
+
- python_abi 3.12.* *_cp312
|
| 229 |
+
constrains:
|
| 230 |
+
- libbrotlicommon 1.2.0 hc919400_1
|
| 231 |
+
license: MIT
|
| 232 |
+
license_family: MIT
|
| 233 |
+
size: 359503
|
| 234 |
+
timestamp: 1764018572368
|
| 235 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_8.conda
|
| 236 |
sha256: b456200636bd5fecb2bec63f7e0985ad2097cf1b83d60ce0b6968dffa6d02aa1
|
| 237 |
md5: 58fd217444c2a5701a44244faf518206
|
|
|
|
| 257 |
license: ISC
|
| 258 |
size: 157131
|
| 259 |
timestamp: 1762976260320
|
| 260 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.4-pyhd8ed1ab_0.conda
|
| 261 |
+
sha256: b32f8362e885f1b8417bac2b3da4db7323faa12d5db62b7fd6691c02d60d6f59
|
| 262 |
+
md5: a22d1fd9bf98827e280a02875d9a007a
|
| 263 |
+
depends:
|
| 264 |
+
- python >=3.10
|
| 265 |
+
license: MIT
|
| 266 |
+
license_family: MIT
|
| 267 |
+
size: 50965
|
| 268 |
+
timestamp: 1760437331772
|
| 269 |
- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda
|
| 270 |
sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715
|
| 271 |
md5: ea8a6c3256897cc31263de9f455e25d9
|
|
|
|
| 277 |
license_family: BSD
|
| 278 |
size: 97676
|
| 279 |
timestamp: 1764518652276
|
| 280 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda
|
| 281 |
+
sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287
|
| 282 |
+
md5: 962b9857ee8e7018c22f2776ffa0b2d7
|
| 283 |
+
depends:
|
| 284 |
+
- python >=3.9
|
| 285 |
+
license: BSD-3-Clause
|
| 286 |
+
license_family: BSD
|
| 287 |
+
size: 27011
|
| 288 |
+
timestamp: 1733218222191
|
| 289 |
- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.12-py312hd8ed1ab_1.conda
|
| 290 |
noarch: generic
|
| 291 |
sha256: b88c76a6d6b45378552ccfd9e88b2a073161fe83fd1294c8fa103ffd32f7934a
|
|
|
|
| 499 |
license_family: MIT
|
| 500 |
size: 95967
|
| 501 |
timestamp: 1756364871835
|
| 502 |
+
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/hf-xet-1.2.1-py310h6ce4931_0.conda
|
| 503 |
+
noarch: python
|
| 504 |
+
sha256: e101714629795f382b3da88473fff0d1c41010b0c827781b1365960768d14d37
|
| 505 |
+
md5: 25c8979ef595b889ec105be9964738f4
|
| 506 |
+
depends:
|
| 507 |
+
- python
|
| 508 |
+
- __osx >=11.0
|
| 509 |
+
- openssl >=3.5.4,<4.0a0
|
| 510 |
+
- _python_abi3_support 1.*
|
| 511 |
+
- cpython >=3.10
|
| 512 |
+
constrains:
|
| 513 |
+
- __osx >=11.0
|
| 514 |
+
license: Apache-2.0
|
| 515 |
+
license_family: APACHE
|
| 516 |
+
size: 2517013
|
| 517 |
+
timestamp: 1763772770292
|
| 518 |
- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda
|
| 519 |
sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba
|
| 520 |
md5: 0a802cb9888dd14eeefc611f05c40b6e
|
|
|
|
| 564 |
license_family: BSD
|
| 565 |
size: 63082
|
| 566 |
timestamp: 1733663449209
|
| 567 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-1.2.1-pyhd8ed1ab_0.conda
|
| 568 |
+
sha256: aac3429da6b4db29137cecc2c56996c7bb01f4f77a0113ff3e2ad324b2a91d13
|
| 569 |
+
md5: 2bf72e2a44977b75967f218146b3d949
|
| 570 |
+
depends:
|
| 571 |
+
- filelock
|
| 572 |
+
- fsspec >=2023.5.0
|
| 573 |
+
- hf-xet >=1.2.0,<2.0.0
|
| 574 |
+
- httpx >=0.23.0,<1
|
| 575 |
+
- packaging >=20.9
|
| 576 |
+
- python >=3.10
|
| 577 |
+
- pyyaml >=5.1
|
| 578 |
+
- requests
|
| 579 |
+
- shellingham
|
| 580 |
+
- tqdm >=4.42.1
|
| 581 |
+
- typer-slim
|
| 582 |
+
- typing-extensions >=3.7.4.3
|
| 583 |
+
- typing_extensions >=3.7.4.3
|
| 584 |
+
license: Apache-2.0
|
| 585 |
+
license_family: APACHE
|
| 586 |
+
size: 325254
|
| 587 |
+
timestamp: 1764954470661
|
| 588 |
- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda
|
| 589 |
sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8
|
| 590 |
md5: 8e6923fc12f1fe8f8c4e5c9f343256ac
|
|
|
|
| 1038 |
license_family: Apache
|
| 1039 |
size: 390301
|
| 1040 |
timestamp: 1763124958546
|
| 1041 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda
|
| 1042 |
+
sha256: 289861ed0c13a15d7bbb408796af4de72c2fe67e2bcb0de98f4c3fce259d7991
|
| 1043 |
+
md5: 58335b26c38bf4a20f399384c33cbcf9
|
| 1044 |
+
depends:
|
| 1045 |
+
- python >=3.8
|
| 1046 |
+
- python
|
| 1047 |
+
license: Apache-2.0
|
| 1048 |
+
license_family: APACHE
|
| 1049 |
+
size: 62477
|
| 1050 |
+
timestamp: 1745345660407
|
| 1051 |
- conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.5-pyhcf101f3_0.conda
|
| 1052 |
sha256: 30de7b4d15fbe53ffe052feccde31223a236dae0495bab54ab2479de30b2990f
|
| 1053 |
md5: a110716cdb11cf51482ff4000dc253d7
|
|
|
|
| 1209 |
license_family: BSD
|
| 1210 |
size: 889287
|
| 1211 |
timestamp: 1750615908735
|
| 1212 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda
|
| 1213 |
+
sha256: ba3b032fa52709ce0d9fd388f63d330a026754587a2f461117cac9ab73d8d0d8
|
| 1214 |
+
md5: 461219d1a5bd61342293efa2c0c90eac
|
| 1215 |
+
depends:
|
| 1216 |
+
- __unix
|
| 1217 |
+
- python >=3.9
|
| 1218 |
+
license: BSD-3-Clause
|
| 1219 |
+
license_family: BSD
|
| 1220 |
+
size: 21085
|
| 1221 |
+
timestamp: 1733217331982
|
| 1222 |
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.12-h18782d2_1_cpython.conda
|
| 1223 |
build_number: 1
|
| 1224 |
sha256: 626da9bb78459ce541407327d1e22ee673fd74e9103f1a0e0f4e3967ad0a23a7
|
|
|
|
| 1339 |
license_family: GPL
|
| 1340 |
size: 252359
|
| 1341 |
timestamp: 1740379663071
|
| 1342 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhd8ed1ab_0.conda
|
| 1343 |
+
sha256: 8dc54e94721e9ab545d7234aa5192b74102263d3e704e6d0c8aa7008f2da2a7b
|
| 1344 |
+
md5: db0c6b99149880c8ba515cf4abe93ee4
|
| 1345 |
+
depends:
|
| 1346 |
+
- certifi >=2017.4.17
|
| 1347 |
+
- charset-normalizer >=2,<4
|
| 1348 |
+
- idna >=2.5,<4
|
| 1349 |
+
- python >=3.9
|
| 1350 |
+
- urllib3 >=1.21.1,<3
|
| 1351 |
+
constrains:
|
| 1352 |
+
- chardet >=3.0.2,<6
|
| 1353 |
+
license: Apache-2.0
|
| 1354 |
+
license_family: APACHE
|
| 1355 |
+
size: 59263
|
| 1356 |
+
timestamp: 1755614348400
|
| 1357 |
- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.2.0-pyhcf101f3_0.conda
|
| 1358 |
sha256: edfb44d0b6468a8dfced728534c755101f06f1a9870a7ad329ec51389f16b086
|
| 1359 |
md5: a247579d8a59931091b16a1e932bbed6
|
|
|
|
| 1524 |
license_family: MIT
|
| 1525 |
size: 20973
|
| 1526 |
timestamp: 1760014679845
|
| 1527 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.1-pyhd8ed1ab_1.conda
|
| 1528 |
+
sha256: 11e2c85468ae9902d24a27137b6b39b4a78099806e551d390e394a8c34b48e40
|
| 1529 |
+
md5: 9efbfdc37242619130ea42b1cc4ed861
|
| 1530 |
+
depends:
|
| 1531 |
+
- colorama
|
| 1532 |
+
- python >=3.9
|
| 1533 |
+
license: MPL-2.0 or MIT
|
| 1534 |
+
size: 89498
|
| 1535 |
+
timestamp: 1735661472632
|
| 1536 |
- conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda
|
| 1537 |
sha256: f39a5620c6e8e9e98357507262a7869de2ae8cc07da8b7f84e517c9fd6c2b959
|
| 1538 |
md5: 019a7385be9af33791c989871317e1ed
|
|
|
|
| 1615 |
license: LicenseRef-Public-Domain
|
| 1616 |
size: 122968
|
| 1617 |
timestamp: 1742727099393
|
| 1618 |
+
- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.0-pyhd8ed1ab_0.conda
|
| 1619 |
+
sha256: 2b95dee46e9e7cfaaecb9cc7f3de70d4ce77a2a1aee4538da4bd1ab7a45c7f9f
|
| 1620 |
+
md5: de7372f43e63ff0876b4023b79b55e95
|
| 1621 |
+
depends:
|
| 1622 |
+
- backports.zstd >=1.0.0
|
| 1623 |
+
- brotli-python >=1.2.0
|
| 1624 |
+
- h2 >=4,<5
|
| 1625 |
+
- pysocks >=1.5.6,<2.0,!=1.5.7
|
| 1626 |
+
- python >=3.10
|
| 1627 |
+
license: MIT
|
| 1628 |
+
license_family: MIT
|
| 1629 |
+
size: 102983
|
| 1630 |
+
timestamp: 1764955468239
|
| 1631 |
- conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.38.0-pyh31011fe_0.conda
|
| 1632 |
sha256: 32e637726fd7cfeb74058e829b116e17514d001846fef56d8c763ec9ec5ac887
|
| 1633 |
md5: d3aa78bc38d9478e9eed5f128ba35f41
|
|
|
|
| 1714 |
license_family: MIT
|
| 1715 |
size: 83386
|
| 1716 |
timestamp: 1753484079473
|
| 1717 |
+
- conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-hbf9d68e_6.conda
|
| 1718 |
+
sha256: 9485ba49e8f47d2b597dd399e88f4802e100851b27c21d7525625b0b4025a5d9
|
| 1719 |
+
md5: ab136e4c34e97f34fb621d2592a393d8
|
| 1720 |
+
depends:
|
| 1721 |
+
- __osx >=11.0
|
| 1722 |
+
- libzlib >=1.3.1,<2.0a0
|
| 1723 |
+
license: BSD-3-Clause
|
| 1724 |
+
license_family: BSD
|
| 1725 |
+
size: 433413
|
| 1726 |
+
timestamp: 1764777166076
|
pixi.toml
CHANGED
|
@@ -17,3 +17,4 @@ numpy = ">=2.3.5,<3"
|
|
| 17 |
fasttext = ">=0.9.2,<0.10"
|
| 18 |
joblib = ">=1.5.2,<2"
|
| 19 |
ipython = ">=9.8.0,<10"
|
|
|
|
|
|
| 17 |
fasttext = ">=0.9.2,<0.10"
|
| 18 |
joblib = ">=1.5.2,<2"
|
| 19 |
ipython = ">=9.8.0,<10"
|
| 20 |
+
huggingface_hub = ">=1.2.1,<2"
|
src/do_pca_on_tfidf.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import polars as pl
|
| 2 |
+
from sklearn.decomposition import TruncatedSVD
|
| 3 |
+
from huggingface_hub import hf_hub_download
|
| 4 |
+
import numpy as np
|
| 5 |
+
from joblib import load
|
| 6 |
+
import scipy
|
| 7 |
+
import fasttext
|
| 8 |
+
|
| 9 |
+
# Device on which torch calculations would take place. Nothing in the visible
# part of this script reads it.
my_device = "mps"

# Load the fastText model file fetched from the Hugging Face Hub.
fasttext_model = fasttext.load_model(hf_hub_download("facebook/fasttext-en-vectors", "model.bin"))

# Load the TF-IDF document-term matrix and the fitted vectorizer.
my_df = pl.read_csv("outputs/TF-IDF-doc-text.csv")
my_vectorizer = load("outputs/tfidf_vectorizer_doc_text.joblib")

# Embed every vocabulary term with fastText.
my_vocabulary = my_vectorizer.get_feature_names_out()
vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])

# One flag per term: True when its embedding has at least one non-zero entry.
# Kept as a plain list of bools because it also selects DataFrame columns and
# IDF entries further down.
keep_terms = np.any(vocab_embeddings != 0, axis=1).tolist()

# Drop terms that have no embedding in the fastText model.
vocab_embeddings = vocab_embeddings[keep_terms, :]
my_vocabulary = my_vocabulary[keep_terms]

# Scale every embedding to unit length so that dot products between rows are
# cosine similarities.
vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)

# Keep the document-term columns of the retained terms and project them onto
# 300 pseudo-topics. The fixed seed makes the randomized SVD (and therefore
# the document ranking) repeatable between runs.
doc_term_mat = my_df.select(pl.exclude(["file"]))[:, keep_terms]
dtm_svd = TruncatedSVD(n_components=300, random_state=0)
X_svd = dtm_svd.fit_transform(doc_term_mat)
|
| 35 |
+
|
| 36 |
+
def query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration=10):
    """Score and rank the corpus documents against a free-text query.

    Every whitespace-separated query term is embedded with the module-level
    ``fasttext_model`` and compared by cosine similarity with each row of
    ``vocab_norm``. A softmax over the scaled similarities is weighted by the
    IDF values of the module-level ``my_vectorizer`` (restricted to
    ``keep_terms``) and projected with ``dtm_svd``. Each document is scored
    by the dot product of its row in ``dtm_svd_mat`` with the mean projected
    query vector.

    Args:
        query: Query text; it is split on whitespace into terms.
        dtm_svd: Fitted object exposing ``transform`` (the script passes a
            ``TruncatedSVD``).
        dtm_svd_mat: Projected document matrix, one row per row of ``my_df``.
        vocab_norm: Unit-length vocabulary embeddings, one row per kept term.
        concentration: Factor applied to the similarities before the softmax;
            larger values concentrate the weight on the closest terms.

    Returns:
        A polars DataFrame with the columns ``score-tfidf``, ``file`` and
        ``rank-tfidf``, sorted by descending score (rank 1 is the best).

    Raises:
        ValueError: If the query has no terms, or none of its terms has a
            non-zero fastText embedding.
    """
    terms = query.split()
    if not terms:
        raise ValueError("query must contain at least one term")

    # query embeddings:
    query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in terms])

    # A term whose embedding is all zeros has length 0; dividing by it would
    # give NaN rows that poison every document score, so such terms are
    # dropped before normalizing.
    lengths = np.linalg.norm(query_embeddings, axis=1, keepdims=True)
    usable = lengths[:, 0] > 0
    if not usable.any():
        raise ValueError("no query term has a non-zero embedding")
    query_norm = query_embeddings[usable] / lengths[usable]

    # Cosine similarity of every usable query term with every vocabulary term.
    query_similarities = np.dot(query_norm, vocab_norm.T)

    # IDF of the kept terms as a single row, broadcast over the query terms.
    idf_row = np.asarray(my_vectorizer.idf_)[keep_terms].reshape(1, -1)
    query_tfidf = idf_row * scipy.special.softmax(query_similarities * concentration, axis=1)
    query_weights = dtm_svd.transform(query_tfidf)

    # Score of each document against the mean projected query vector.
    mean_query_score = np.sum(np.mean(query_weights, axis=0) * dtm_svd_mat, axis=1)

    ranks = pl.Series("rank-tfidf", list(range(1, len(mean_query_score) + 1)))
    sorted_df = pl.DataFrame(
        {
            'score-tfidf': mean_query_score,
            'file': my_df['file'],
        }
    ).sort("score-tfidf", descending=True).with_columns(ranks)

    return sorted_df
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def query_factory(dtm_svd, dtm_svd_mat, vocab_norm, concentration=10):
    """Build a one-argument search function around ``query_worker``.

    The projection object, the projected document matrix, the vocabulary
    embeddings and the softmax concentration are captured once; the returned
    function only needs the query text.

    Args:
        dtm_svd: Forwarded to ``query_worker`` as ``dtm_svd``.
        dtm_svd_mat: Forwarded to ``query_worker`` as ``dtm_svd_mat``.
        vocab_norm: Forwarded to ``query_worker`` as ``vocab_norm``.
        concentration: Forwarded to ``query_worker`` as ``concentration``.

    Returns:
        A function ``f(query)`` returning whatever ``query_worker`` returns.
    """
    def run_query(query):
        return query_worker(
            query,
            dtm_svd=dtm_svd,
            dtm_svd_mat=dtm_svd_mat,
            vocab_norm=vocab_norm,
            concentration=concentration,
        )

    return run_query
|
| 67 |
+
|
| 68 |
+
# Search function over the projected corpus, with concentration 30 instead of
# the default 10.
query_docs = query_factory(dtm_svd=dtm_svd, dtm_svd_mat=X_svd, vocab_norm=vocab_norm, concentration=30)

# Example query; the same text that search-embeddings.py uses, so that the
# two rankings can be compared.
query = "plans for raising grant revenue directed to the libraries"
res_tfidf = query_docs(query)
|
src/encode.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
#
|
| 4 |
+
def encode(sentences, tokenizer, model, device="mps"):
    """Embed sentences by mean-pooling the model's last hidden state.

    Args:
        sentences: Text (or list of texts) handed to ``tokenizer``.
        tokenizer: Callable returning a mapping of tensors that has a
            ``.to(device=...)`` method (e.g. a Hugging Face tokenizer).
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with ``last_hidden_state``.
        device: Device the tokenized inputs are moved to before the forward
            pass.

    Returns:
        Tensor of shape ``[batch, hidden_dim]`` with one embedding per input.
    """
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device=device)

    with torch.no_grad():
        outputs = model(**inputs)

    # outputs.last_hidden_state = [batch, tokens, hidden_dim]
    token_embeddings = outputs.last_hidden_state

    # Without an attention mask there is nothing to tell padding from real
    # tokens, so fall back to a plain mean over the token axis.
    if "attention_mask" not in inputs:
        return token_embeddings.mean(dim=1)

    # Mean pooling over real tokens only: with padding=True the shorter
    # sentences of a batch are padded, and averaging over those positions
    # would make an embedding depend on the other sentences in the batch.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against an all-padding row

    return summed / token_counts
|
src/search-embeddings.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# import packages
|
| 2 |
+
import numpy as np
|
| 3 |
+
import polars as pl
|
| 4 |
+
|
| 5 |
+
from encode import encode
|
| 6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
+
|
| 8 |
+
from transformers import AutoTokenizer, AutoModel
|
| 9 |
+
|
| 10 |
+
import glob
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# define the device where torch calculations take place
my_device = "mps"

# Instantiate the sentence-transformer model and move it to the device:
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
sentence_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentence_model = AutoModel.from_pretrained(model_name).to(device=my_device)

# import the block embeddings
prefix = "outputs/block-embeddings-"
# Every path that starts with the prefix. glob returns matches in arbitrary
# order, so they are sorted to keep the concatenation order repeatable.
files = sorted(glob.glob(prefix + "*"))

block_embeddings_list = []
for filename in files:
    print("Reading:", filename)
    block_embeddings_list.append(pl.read_csv(filename))

# Stack the per-file tables into one table.
block_embeddings_df = pl.concat(block_embeddings_list, how='vertical')
|
| 31 |
+
|
| 32 |
+
def sbert_query(query, corpus_embeddings_df):
    """Rank files by the best cosine similarity between a query and their rows.

    The query is embedded with the module-level ``sentence_tokenizer`` and
    ``sentence_model`` on ``my_device`` and compared with every row of
    ``corpus_embeddings_df``. Each file keeps the maximum score over its rows.

    Args:
        query: Query text passed to ``encode``.
        corpus_embeddings_df: polars DataFrame with a ``file`` column, a
            ``doc_block_indx`` column and the embedding columns; every column
            other than those two is treated as an embedding dimension.

    Returns:
        A polars DataFrame with the columns ``file``, ``score`` and
        ``rank-sbert``, sorted by descending score (rank 1 is the best).
    """
    # Run the tokenized query on the same device the model was moved to.
    query_embeddings = encode(
        query,
        tokenizer=sentence_tokenizer,
        model=sentence_model,
        device=my_device,
    ).cpu().numpy()

    corpus_matrix = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx'])).to_numpy()
    sbert_scores = cosine_similarity(query_embeddings, corpus_matrix)

    # One score per corpus row, reduced to the best score of each file.
    best_per_file = pl.DataFrame(
        {
            'score': sbert_scores.ravel(),
            'file': corpus_embeddings_df['file'],
        }
    ).group_by("file").agg(pl.col("score").max())

    ranks = pl.Series("rank-sbert", list(range(1, best_per_file.height + 1)))
    return best_per_file.sort("score", descending=True).with_columns(ranks)
|
| 46 |
+
|
| 47 |
+
def sbert_query_factory(corpus_embeddings_df):
    """Build a one-argument search function around ``sbert_query``.

    Args:
        corpus_embeddings_df: Table forwarded to ``sbert_query`` on every call.

    Returns:
        A function ``f(my_query)`` returning whatever ``sbert_query`` returns
        for that query and the captured table.
    """
    def search(my_query):
        return sbert_query(my_query, corpus_embeddings_df=corpus_embeddings_df)

    return search
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Example query to run against the corpus.
query = "plans for raising grant revenue directed to the libraries"

# Bind the block embeddings into a one-argument SBERT search function.
sbert_query_docs = sbert_query_factory(block_embeddings_df)

# Per-file SBERT ranking for the example query.
res_sbert = sbert_query_docs(query)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
#res.group_by("file").agg(pl.col("rank").min(), pl.col("score").max()).sort("rank")
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
|