wbrooks committed on
Commit
c795cd4
·
1 Parent(s): fe9eb34

added scripts for testing inference

Browse files
Files changed (5) hide show
  1. pixi.lock +161 -0
  2. pixi.toml +1 -0
  3. src/do_pca_on_tfidf.py +70 -0
  4. src/encode.py +14 -0
  5. src/search-embeddings.py +67 -0
pixi.lock CHANGED
@@ -11,10 +11,14 @@ environments:
11
  - conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda
12
  - conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.12.0-pyhcf101f3_0.conda
13
  - conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda
 
 
14
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_8.conda
15
  - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.11.12-hbd8a1cb_0.conda
16
  - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.11.12-pyhd8ed1ab_0.conda
 
17
  - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda
 
18
  - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.12-py312hd8ed1ab_1.conda
19
  - conda: https://conda.anaconda.org/conda-forge/noarch/decorator-5.2.1-pyhd8ed1ab_0.conda
20
  - conda: https://conda.anaconda.org/conda-forge/noarch/dnspython-2.8.0-pyhcf101f3_0.conda
@@ -33,10 +37,12 @@ environments:
33
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/gmpy2-2.2.1-py312hee6aa52_2.conda
34
  - conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda
35
  - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda
 
36
  - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda
37
  - conda: https://conda.anaconda.org/conda-forge/noarch/httpcore-1.0.9-pyh29332c3_0.conda
38
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/httptools-0.7.1-py312h4409184_1.conda
39
  - conda: https://conda.anaconda.org/conda-forge/noarch/httpx-0.28.1-pyhd8ed1ab_0.conda
 
40
  - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda
41
  - conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda
42
  - conda: https://conda.anaconda.org/conda-forge/noarch/ipython-9.8.0-pyh53cf698_0.conda
@@ -75,6 +81,7 @@ environments:
75
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/numpy-2.3.5-py312h85ea64e_0.conda
76
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.6.0-h5503f6c_0.conda
77
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/optree-0.18.0-py312h84eede6_0.conda
 
78
  - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.5-pyhcf101f3_0.conda
79
  - conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda
80
  - conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.35.2-pyh6a1acc5_0.conda
@@ -88,6 +95,7 @@ environments:
88
  - conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda
89
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pydantic-core-2.41.5-py312h6ef9ec0_1.conda
90
  - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda
 
91
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.12-h18782d2_1_cpython.conda
92
  - conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.1-pyhcf101f3_0.conda
93
  - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.12.12-hd8ed1ab_1.conda
@@ -96,6 +104,7 @@ environments:
96
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pytorch-2.9.1-cpu_generic_py312_hdde6e1b_1.conda
97
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pyyaml-6.0.3-py312h5748b74_0.conda
98
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h1d1bf99_2.conda
 
99
  - conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.2.0-pyhcf101f3_0.conda
100
  - conda: https://conda.anaconda.org/conda-forge/noarch/rich-toolkit-0.17.0-pyhcf101f3_0.conda
101
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/scikit-learn-1.7.2-py312h79e0ffc_0.conda
@@ -110,6 +119,7 @@ environments:
110
  - conda: https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda
111
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h892fb3f_3.conda
112
  - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.3.0-pyhcf101f3_0.conda
 
113
  - conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda
114
  - conda: https://conda.anaconda.org/conda-forge/noarch/typer-0.20.0-pyhefaf540_1.conda
115
  - conda: https://conda.anaconda.org/conda-forge/noarch/typer-slim-0.20.0-pyhcf101f3_1.conda
@@ -118,6 +128,7 @@ environments:
118
  - conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda
119
  - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda
120
  - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda
 
121
  - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.38.0-pyh31011fe_0.conda
122
  - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-standard-0.38.0-h31011fe_0.conda
123
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/uvloop-0.22.1-py312h4409184_1.conda
@@ -125,6 +136,7 @@ environments:
125
  - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda
126
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/websockets-15.0.1-py312h290adc7_2.conda
127
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/yaml-0.2.5-h925e9cb_3.conda
 
128
  packages:
129
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/_openmp_mutex-4.5-7_kmp_llvm.conda
130
  build_number: 7
@@ -193,6 +205,33 @@ packages:
193
  license_family: Apache
194
  size: 28797
195
  timestamp: 1763410017955
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_8.conda
197
  sha256: b456200636bd5fecb2bec63f7e0985ad2097cf1b83d60ce0b6968dffa6d02aa1
198
  md5: 58fd217444c2a5701a44244faf518206
@@ -218,6 +257,15 @@ packages:
218
  license: ISC
219
  size: 157131
220
  timestamp: 1762976260320
 
 
 
 
 
 
 
 
 
221
  - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda
222
  sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715
223
  md5: ea8a6c3256897cc31263de9f455e25d9
@@ -229,6 +277,15 @@ packages:
229
  license_family: BSD
230
  size: 97676
231
  timestamp: 1764518652276
 
 
 
 
 
 
 
 
 
232
  - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.12-py312hd8ed1ab_1.conda
233
  noarch: generic
234
  sha256: b88c76a6d6b45378552ccfd9e88b2a073161fe83fd1294c8fa103ffd32f7934a
@@ -442,6 +499,22 @@ packages:
442
  license_family: MIT
443
  size: 95967
444
  timestamp: 1756364871835
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda
446
  sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba
447
  md5: 0a802cb9888dd14eeefc611f05c40b6e
@@ -491,6 +564,27 @@ packages:
491
  license_family: BSD
492
  size: 63082
493
  timestamp: 1733663449209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda
495
  sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8
496
  md5: 8e6923fc12f1fe8f8c4e5c9f343256ac
@@ -944,6 +1038,16 @@ packages:
944
  license_family: Apache
945
  size: 390301
946
  timestamp: 1763124958546
 
 
 
 
 
 
 
 
 
 
947
  - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.5-pyhcf101f3_0.conda
948
  sha256: 30de7b4d15fbe53ffe052feccde31223a236dae0495bab54ab2479de30b2990f
949
  md5: a110716cdb11cf51482ff4000dc253d7
@@ -1105,6 +1209,16 @@ packages:
1105
  license_family: BSD
1106
  size: 889287
1107
  timestamp: 1750615908735
 
 
 
 
 
 
 
 
 
 
1108
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.12-h18782d2_1_cpython.conda
1109
  build_number: 1
1110
  sha256: 626da9bb78459ce541407327d1e22ee673fd74e9103f1a0e0f4e3967ad0a23a7
@@ -1225,6 +1339,21 @@ packages:
1225
  license_family: GPL
1226
  size: 252359
1227
  timestamp: 1740379663071
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1228
  - conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.2.0-pyhcf101f3_0.conda
1229
  sha256: edfb44d0b6468a8dfced728534c755101f06f1a9870a7ad329ec51389f16b086
1230
  md5: a247579d8a59931091b16a1e932bbed6
@@ -1395,6 +1524,15 @@ packages:
1395
  license_family: MIT
1396
  size: 20973
1397
  timestamp: 1760014679845
 
 
 
 
 
 
 
 
 
1398
  - conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda
1399
  sha256: f39a5620c6e8e9e98357507262a7869de2ae8cc07da8b7f84e517c9fd6c2b959
1400
  md5: 019a7385be9af33791c989871317e1ed
@@ -1477,6 +1615,19 @@ packages:
1477
  license: LicenseRef-Public-Domain
1478
  size: 122968
1479
  timestamp: 1742727099393
 
 
 
 
 
 
 
 
 
 
 
 
 
1480
  - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.38.0-pyh31011fe_0.conda
1481
  sha256: 32e637726fd7cfeb74058e829b116e17514d001846fef56d8c763ec9ec5ac887
1482
  md5: d3aa78bc38d9478e9eed5f128ba35f41
@@ -1563,3 +1714,13 @@ packages:
1563
  license_family: MIT
1564
  size: 83386
1565
  timestamp: 1753484079473
 
 
 
 
 
 
 
 
 
 
 
11
  - conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda
12
  - conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.12.0-pyhcf101f3_0.conda
13
  - conda: https://conda.anaconda.org/conda-forge/noarch/asttokens-3.0.1-pyhd8ed1ab_0.conda
14
+ - conda: https://conda.anaconda.org/conda-forge/osx-arm64/backports.zstd-1.2.0-py312h84d6f5f_0.conda
15
+ - conda: https://conda.anaconda.org/conda-forge/osx-arm64/brotli-python-1.2.0-py312h0dfefe5_1.conda
16
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_8.conda
17
  - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.11.12-hbd8a1cb_0.conda
18
  - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.11.12-pyhd8ed1ab_0.conda
19
+ - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.4-pyhd8ed1ab_0.conda
20
  - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda
21
+ - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda
22
  - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.12-py312hd8ed1ab_1.conda
23
  - conda: https://conda.anaconda.org/conda-forge/noarch/decorator-5.2.1-pyhd8ed1ab_0.conda
24
  - conda: https://conda.anaconda.org/conda-forge/noarch/dnspython-2.8.0-pyhcf101f3_0.conda
 
37
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/gmpy2-2.2.1-py312hee6aa52_2.conda
38
  - conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda
39
  - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda
40
+ - conda: https://conda.anaconda.org/conda-forge/osx-arm64/hf-xet-1.2.1-py310h6ce4931_0.conda
41
  - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda
42
  - conda: https://conda.anaconda.org/conda-forge/noarch/httpcore-1.0.9-pyh29332c3_0.conda
43
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/httptools-0.7.1-py312h4409184_1.conda
44
  - conda: https://conda.anaconda.org/conda-forge/noarch/httpx-0.28.1-pyhd8ed1ab_0.conda
45
+ - conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-1.2.1-pyhd8ed1ab_0.conda
46
  - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda
47
  - conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda
48
  - conda: https://conda.anaconda.org/conda-forge/noarch/ipython-9.8.0-pyh53cf698_0.conda
 
81
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/numpy-2.3.5-py312h85ea64e_0.conda
82
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/openssl-3.6.0-h5503f6c_0.conda
83
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/optree-0.18.0-py312h84eede6_0.conda
84
+ - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda
85
  - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.5-pyhcf101f3_0.conda
86
  - conda: https://conda.anaconda.org/conda-forge/noarch/pexpect-4.9.0-pyhd8ed1ab_1.conda
87
  - conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.35.2-pyh6a1acc5_0.conda
 
95
  - conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda
96
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pydantic-core-2.41.5-py312h6ef9ec0_1.conda
97
  - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda
98
+ - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda
99
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.12-h18782d2_1_cpython.conda
100
  - conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.1-pyhcf101f3_0.conda
101
  - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.12.12-hd8ed1ab_1.conda
 
104
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pytorch-2.9.1-cpu_generic_py312_hdde6e1b_1.conda
105
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pyyaml-6.0.3-py312h5748b74_0.conda
106
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/readline-8.2-h1d1bf99_2.conda
107
+ - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhd8ed1ab_0.conda
108
  - conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.2.0-pyhcf101f3_0.conda
109
  - conda: https://conda.anaconda.org/conda-forge/noarch/rich-toolkit-0.17.0-pyhcf101f3_0.conda
110
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/scikit-learn-1.7.2-py312h79e0ffc_0.conda
 
119
  - conda: https://conda.anaconda.org/conda-forge/noarch/threadpoolctl-3.6.0-pyhecae5ae_0.conda
120
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/tk-8.6.13-h892fb3f_3.conda
121
  - conda: https://conda.anaconda.org/conda-forge/noarch/tomli-2.3.0-pyhcf101f3_0.conda
122
+ - conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.1-pyhd8ed1ab_1.conda
123
  - conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda
124
  - conda: https://conda.anaconda.org/conda-forge/noarch/typer-0.20.0-pyhefaf540_1.conda
125
  - conda: https://conda.anaconda.org/conda-forge/noarch/typer-slim-0.20.0-pyhcf101f3_1.conda
 
128
  - conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda
129
  - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda
130
  - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda
131
+ - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.0-pyhd8ed1ab_0.conda
132
  - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.38.0-pyh31011fe_0.conda
133
  - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-standard-0.38.0-h31011fe_0.conda
134
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/uvloop-0.22.1-py312h4409184_1.conda
 
136
  - conda: https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.14-pyhd8ed1ab_0.conda
137
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/websockets-15.0.1-py312h290adc7_2.conda
138
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/yaml-0.2.5-h925e9cb_3.conda
139
+ - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-hbf9d68e_6.conda
140
  packages:
141
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/_openmp_mutex-4.5-7_kmp_llvm.conda
142
  build_number: 7
 
205
  license_family: Apache
206
  size: 28797
207
  timestamp: 1763410017955
208
+ - conda: https://conda.anaconda.org/conda-forge/osx-arm64/backports.zstd-1.2.0-py312h84d6f5f_0.conda
209
+ sha256: 833370729199ef55f3f9efd024e28bba87fcd8b5c397d8afecefde63851e6997
210
+ md5: c0ca697637ef6cf0ac768a50964e4af6
211
+ depends:
212
+ - python
213
+ - __osx >=11.0
214
+ - python 3.12.* *_cpython
215
+ - python_abi 3.12.* *_cp312
216
+ - zstd >=1.5.7,<1.6.0a0
217
+ license: BSD-3-Clause AND MIT AND EPL-2.0
218
+ size: 241337
219
+ timestamp: 1765057702057
220
+ - conda: https://conda.anaconda.org/conda-forge/osx-arm64/brotli-python-1.2.0-py312h0dfefe5_1.conda
221
+ sha256: 6178775a86579d5e8eec6a7ab316c24f1355f6c6ccbe84bb341f342f1eda2440
222
+ md5: 311fcf3f6a8c4eb70f912798035edd35
223
+ depends:
224
+ - __osx >=11.0
225
+ - libcxx >=19
226
+ - python >=3.12,<3.13.0a0
227
+ - python >=3.12,<3.13.0a0 *_cpython
228
+ - python_abi 3.12.* *_cp312
229
+ constrains:
230
+ - libbrotlicommon 1.2.0 hc919400_1
231
+ license: MIT
232
+ license_family: MIT
233
+ size: 359503
234
+ timestamp: 1764018572368
235
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/bzip2-1.0.8-hd037594_8.conda
236
  sha256: b456200636bd5fecb2bec63f7e0985ad2097cf1b83d60ce0b6968dffa6d02aa1
237
  md5: 58fd217444c2a5701a44244faf518206
 
257
  license: ISC
258
  size: 157131
259
  timestamp: 1762976260320
260
+ - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.4-pyhd8ed1ab_0.conda
261
+ sha256: b32f8362e885f1b8417bac2b3da4db7323faa12d5db62b7fd6691c02d60d6f59
262
+ md5: a22d1fd9bf98827e280a02875d9a007a
263
+ depends:
264
+ - python >=3.10
265
+ license: MIT
266
+ license_family: MIT
267
+ size: 50965
268
+ timestamp: 1760437331772
269
  - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda
270
  sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715
271
  md5: ea8a6c3256897cc31263de9f455e25d9
 
277
  license_family: BSD
278
  size: 97676
279
  timestamp: 1764518652276
280
+ - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda
281
+ sha256: ab29d57dc70786c1269633ba3dff20288b81664d3ff8d21af995742e2bb03287
282
+ md5: 962b9857ee8e7018c22f2776ffa0b2d7
283
+ depends:
284
+ - python >=3.9
285
+ license: BSD-3-Clause
286
+ license_family: BSD
287
+ size: 27011
288
+ timestamp: 1733218222191
289
  - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.12-py312hd8ed1ab_1.conda
290
  noarch: generic
291
  sha256: b88c76a6d6b45378552ccfd9e88b2a073161fe83fd1294c8fa103ffd32f7934a
 
499
  license_family: MIT
500
  size: 95967
501
  timestamp: 1756364871835
502
+ - conda: https://conda.anaconda.org/conda-forge/osx-arm64/hf-xet-1.2.1-py310h6ce4931_0.conda
503
+ noarch: python
504
+ sha256: e101714629795f382b3da88473fff0d1c41010b0c827781b1365960768d14d37
505
+ md5: 25c8979ef595b889ec105be9964738f4
506
+ depends:
507
+ - python
508
+ - __osx >=11.0
509
+ - openssl >=3.5.4,<4.0a0
510
+ - _python_abi3_support 1.*
511
+ - cpython >=3.10
512
+ constrains:
513
+ - __osx >=11.0
514
+ license: Apache-2.0
515
+ license_family: APACHE
516
+ size: 2517013
517
+ timestamp: 1763772770292
518
  - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda
519
  sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba
520
  md5: 0a802cb9888dd14eeefc611f05c40b6e
 
564
  license_family: BSD
565
  size: 63082
566
  timestamp: 1733663449209
567
+ - conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-1.2.1-pyhd8ed1ab_0.conda
568
+ sha256: aac3429da6b4db29137cecc2c56996c7bb01f4f77a0113ff3e2ad324b2a91d13
569
+ md5: 2bf72e2a44977b75967f218146b3d949
570
+ depends:
571
+ - filelock
572
+ - fsspec >=2023.5.0
573
+ - hf-xet >=1.2.0,<2.0.0
574
+ - httpx >=0.23.0,<1
575
+ - packaging >=20.9
576
+ - python >=3.10
577
+ - pyyaml >=5.1
578
+ - requests
579
+ - shellingham
580
+ - tqdm >=4.42.1
581
+ - typer-slim
582
+ - typing-extensions >=3.7.4.3
583
+ - typing_extensions >=3.7.4.3
584
+ license: Apache-2.0
585
+ license_family: APACHE
586
+ size: 325254
587
+ timestamp: 1764954470661
588
  - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda
589
  sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8
590
  md5: 8e6923fc12f1fe8f8c4e5c9f343256ac
 
1038
  license_family: Apache
1039
  size: 390301
1040
  timestamp: 1763124958546
1041
+ - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda
1042
+ sha256: 289861ed0c13a15d7bbb408796af4de72c2fe67e2bcb0de98f4c3fce259d7991
1043
+ md5: 58335b26c38bf4a20f399384c33cbcf9
1044
+ depends:
1045
+ - python >=3.8
1046
+ - python
1047
+ license: Apache-2.0
1048
+ license_family: APACHE
1049
+ size: 62477
1050
+ timestamp: 1745345660407
1051
  - conda: https://conda.anaconda.org/conda-forge/noarch/parso-0.8.5-pyhcf101f3_0.conda
1052
  sha256: 30de7b4d15fbe53ffe052feccde31223a236dae0495bab54ab2479de30b2990f
1053
  md5: a110716cdb11cf51482ff4000dc253d7
 
1209
  license_family: BSD
1210
  size: 889287
1211
  timestamp: 1750615908735
1212
+ - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda
1213
+ sha256: ba3b032fa52709ce0d9fd388f63d330a026754587a2f461117cac9ab73d8d0d8
1214
+ md5: 461219d1a5bd61342293efa2c0c90eac
1215
+ depends:
1216
+ - __unix
1217
+ - python >=3.9
1218
+ license: BSD-3-Clause
1219
+ license_family: BSD
1220
+ size: 21085
1221
+ timestamp: 1733217331982
1222
  - conda: https://conda.anaconda.org/conda-forge/osx-arm64/python-3.12.12-h18782d2_1_cpython.conda
1223
  build_number: 1
1224
  sha256: 626da9bb78459ce541407327d1e22ee673fd74e9103f1a0e0f4e3967ad0a23a7
 
1339
  license_family: GPL
1340
  size: 252359
1341
  timestamp: 1740379663071
1342
+ - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhd8ed1ab_0.conda
1343
+ sha256: 8dc54e94721e9ab545d7234aa5192b74102263d3e704e6d0c8aa7008f2da2a7b
1344
+ md5: db0c6b99149880c8ba515cf4abe93ee4
1345
+ depends:
1346
+ - certifi >=2017.4.17
1347
+ - charset-normalizer >=2,<4
1348
+ - idna >=2.5,<4
1349
+ - python >=3.9
1350
+ - urllib3 >=1.21.1,<3
1351
+ constrains:
1352
+ - chardet >=3.0.2,<6
1353
+ license: Apache-2.0
1354
+ license_family: APACHE
1355
+ size: 59263
1356
+ timestamp: 1755614348400
1357
  - conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.2.0-pyhcf101f3_0.conda
1358
  sha256: edfb44d0b6468a8dfced728534c755101f06f1a9870a7ad329ec51389f16b086
1359
  md5: a247579d8a59931091b16a1e932bbed6
 
1524
  license_family: MIT
1525
  size: 20973
1526
  timestamp: 1760014679845
1527
+ - conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.1-pyhd8ed1ab_1.conda
1528
+ sha256: 11e2c85468ae9902d24a27137b6b39b4a78099806e551d390e394a8c34b48e40
1529
+ md5: 9efbfdc37242619130ea42b1cc4ed861
1530
+ depends:
1531
+ - colorama
1532
+ - python >=3.9
1533
+ license: MPL-2.0 or MIT
1534
+ size: 89498
1535
+ timestamp: 1735661472632
1536
  - conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda
1537
  sha256: f39a5620c6e8e9e98357507262a7869de2ae8cc07da8b7f84e517c9fd6c2b959
1538
  md5: 019a7385be9af33791c989871317e1ed
 
1615
  license: LicenseRef-Public-Domain
1616
  size: 122968
1617
  timestamp: 1742727099393
1618
+ - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.0-pyhd8ed1ab_0.conda
1619
+ sha256: 2b95dee46e9e7cfaaecb9cc7f3de70d4ce77a2a1aee4538da4bd1ab7a45c7f9f
1620
+ md5: de7372f43e63ff0876b4023b79b55e95
1621
+ depends:
1622
+ - backports.zstd >=1.0.0
1623
+ - brotli-python >=1.2.0
1624
+ - h2 >=4,<5
1625
+ - pysocks >=1.5.6,<2.0,!=1.5.7
1626
+ - python >=3.10
1627
+ license: MIT
1628
+ license_family: MIT
1629
+ size: 102983
1630
+ timestamp: 1764955468239
1631
  - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.38.0-pyh31011fe_0.conda
1632
  sha256: 32e637726fd7cfeb74058e829b116e17514d001846fef56d8c763ec9ec5ac887
1633
  md5: d3aa78bc38d9478e9eed5f128ba35f41
 
1714
  license_family: MIT
1715
  size: 83386
1716
  timestamp: 1753484079473
1717
+ - conda: https://conda.anaconda.org/conda-forge/osx-arm64/zstd-1.5.7-hbf9d68e_6.conda
1718
+ sha256: 9485ba49e8f47d2b597dd399e88f4802e100851b27c21d7525625b0b4025a5d9
1719
+ md5: ab136e4c34e97f34fb621d2592a393d8
1720
+ depends:
1721
+ - __osx >=11.0
1722
+ - libzlib >=1.3.1,<2.0a0
1723
+ license: BSD-3-Clause
1724
+ license_family: BSD
1725
+ size: 433413
1726
+ timestamp: 1764777166076
pixi.toml CHANGED
@@ -17,3 +17,4 @@ numpy = ">=2.3.5,<3"
17
  fasttext = ">=0.9.2,<0.10"
18
  joblib = ">=1.5.2,<2"
19
  ipython = ">=9.8.0,<10"
 
 
17
  fasttext = ">=0.9.2,<0.10"
18
  joblib = ">=1.5.2,<2"
19
  ipython = ">=9.8.0,<10"
20
+ huggingface_hub = ">=1.2.1,<2"
src/do_pca_on_tfidf.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import polars as pl
2
+ from sklearn.decomposition import TruncatedSVD
3
+ from huggingface_hub import hf_hub_download
4
+ import numpy as np
5
+ from joblib import load
6
+ import scipy
7
+ import fasttext
8
+
9
# Device name for torch calculations (not referenced again in the visible part of this script).
my_device = "mps"

# Fetch the English fastText vectors from the Hugging Face hub and load them.
fasttext_bin_path = hf_hub_download("facebook/fasttext-en-vectors", "model.bin")
fasttext_model = fasttext.load_model(fasttext_bin_path)

# Load the TF-IDF document-term table and the vectorizer that produced it.
# NOTE: joblib.load unpickles its input; only load artifacts this project wrote.
my_df = pl.read_csv("outputs/TF-IDF-doc-text.csv")
my_vectorizer = load("outputs/tfidf_vectorizer_doc_text.joblib")

# One fastText vector per vocabulary term, in vectorizer feature order.
my_vocabulary = my_vectorizer.get_feature_names_out()
vocab_embeddings = np.array(
    [fasttext_model.get_word_vector(term) for term in my_vocabulary]
)

# A term is kept when its embedding has at least one non-zero component.
keep_terms = [bool((row != 0).any()) for row in vocab_embeddings]

# Drop terms whose fastText embedding is all zeros.
vocab_embeddings = vocab_embeddings[keep_terms, :]
my_vocabulary = my_vocabulary[keep_terms]

# Scale every remaining embedding to unit length. The division is safe because
# each kept row has a non-zero component and therefore a non-zero norm.
vocab_lengths = np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)
vocab_norm = vocab_embeddings / vocab_lengths

# Take the document-term matrix (every column except "file", restricted to the
# kept terms) and project it to 300 pseudo-topics.
term_columns = my_df.select(pl.exclude(["file"]))
doc_term_mat = term_columns[:, keep_terms]
dtm_svd = TruncatedSVD(n_components=300)
X_svd = dtm_svd.fit_transform(doc_term_mat)
35
+
36
def query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10 ):
    """Score and rank every document against a free-text query.

    Each whitespace-separated query token is embedded with the module-level
    ``fasttext_model``, compared by cosine similarity with the unit-length
    vocabulary embeddings, turned into a soft pseudo TF-IDF row (softmax over
    the vocabulary, scaled by the vectorizer's IDF weights), and projected with
    the fitted SVD. Documents are scored by the dot product between their SVD
    coordinates and the mean projected query row.

    Besides its arguments, this function reads the module-level names
    ``fasttext_model``, ``my_vectorizer``, ``keep_terms`` and ``my_df``.

    Args:
        query: Query text; split on whitespace into terms.
        dtm_svd: Fitted decomposition object exposing ``transform``.
        dtm_svd_mat: Document coordinates in the SVD space, one row per
            document, in the same row order as ``my_df``.
        vocab_norm: Unit-length vocabulary embeddings, one row per kept term.
        concentration: Multiplier applied to the similarities before the
            softmax; larger values concentrate weight on the closest terms.

    Returns:
        A polars DataFrame with columns ``score-tfidf``, ``file`` and
        ``rank-tfidf`` (1 = best), sorted by descending score.

    Raises:
        ValueError: If the query has no terms, or none of its terms has a
            non-zero embedding.
    """
    terms = query.split()
    if not terms:
        # Without this guard the normalisation below fails with an opaque
        # axis error on an empty array.
        raise ValueError("query must contain at least one term")

    # query embeddings:
    query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in terms])

    # Drop terms with an all-zero embedding (mirrors how the vocabulary is
    # filtered); dividing them by their zero norm would produce NaN rows.
    query_lengths = np.linalg.norm(query_embeddings, axis=1, keepdims=True)
    has_embedding = query_lengths[:, 0] > 0
    if not has_embedding.any():
        raise ValueError("no query term has a non-zero embedding")

    # Normalize rows
    query_norm = query_embeddings[has_embedding] / query_lengths[has_embedding]

    # Compute cosine similarity matrix (query terms x vocabulary terms)
    query_similarities = np.dot(query_norm, vocab_norm.T)

    # IDF weights of the kept terms as a row that broadcasts over query terms.
    idf_row = np.reshape(my_vectorizer.idf_[keep_terms], (-1, vocab_norm.shape[0]))
    query_tfidf = idf_row * scipy.special.softmax(query_similarities * concentration, axis = 1)
    query_weights = dtm_svd.transform(query_tfidf)

    # calculate the average TF-IDF score of the query over topics:
    mean_query_score = np.sum(np.mean(query_weights, axis=0) * dtm_svd_mat, axis=1)

    ranked_df = pl.DataFrame(
        {
            'score-tfidf': mean_query_score,
            'file': my_df['file'],
        }
    ).sort("score-tfidf", descending = True)

    # Rows are already in score order, so the rank is just the row position.
    return ranked_df.with_columns(
        pl.Series("rank-tfidf", list(range(1, ranked_df.height + 1)))
    )
59
+
60
+
61
+
62
def query_factory(dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10):
    """Build a one-argument search function with the corpus state bound in.

    Args:
        dtm_svd: Fitted decomposition forwarded to ``query_worker``.
        dtm_svd_mat: Document coordinates forwarded to ``query_worker``.
        vocab_norm: Vocabulary embeddings forwarded to ``query_worker``.
        concentration: Softmax multiplier forwarded to ``query_worker``.

    Returns:
        A callable taking ``query`` and returning whatever ``query_worker``
        returns for that query with the bound arguments.
    """
    return lambda query: query_worker(
        query, dtm_svd, dtm_svd_mat, vocab_norm, concentration
    )
67
+
68
# Bind the fitted SVD, the projected documents and the vocabulary embeddings
# into a single-argument search function.
query_docs = query_factory(dtm_svd = dtm_svd, dtm_svd_mat = X_svd, vocab_norm=vocab_norm, concentration = 30)

# `query` was never defined in this script, so running it on its own raised
# NameError. Reuse a `query` already present in the namespace (e.g. when run
# inside a shared interactive session); otherwise fall back to a sample query.
query = globals().get("query", "plans for raising grant revenue directed to the libraries")
res_tfidf = query_docs(query)
src/encode.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ #
4
def encode(sentences, tokenizer, model, device="mps"):
    """Embed sentences by mean-pooling the model's last hidden state.

    The sentences are tokenized with padding and truncation, moved to
    ``device`` and run through ``model`` without gradient tracking. Token
    vectors are then averaged per sentence. Padding positions are excluded
    from the average via the tokenizer's ``attention_mask``, so a sentence's
    embedding does not depend on how long the other sentences in the batch
    are. With no padding (e.g. a single sentence) this is the plain mean
    over tokens.

    Args:
        sentences: Text (or batch of texts) passed straight to ``tokenizer``.
        tokenizer: Callable accepting ``padding``, ``truncation`` and
            ``return_tensors``; its result must support ``.to(device=...)``
            and ``**`` unpacking.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with ``last_hidden_state`` of shape
            [batch, tokens, hidden_dim].
        device: Torch device name the tokenized inputs are moved to.

    Returns:
        Tensor of shape [batch, hidden_dim] with one embedding per sentence.
    """
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device = device)

    with torch.no_grad():
        outputs = model(**inputs)

        # outputs.last_hidden_state = [batch, tokens, hidden_dim]
        hidden = outputs.last_hidden_state

        if "attention_mask" not in inputs:
            # No mask available: fall back to the unweighted mean over tokens.
            return hidden.mean(dim=1)

        # masked mean pooling: zero out padding positions, then divide by the
        # number of real tokens in each sentence.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)  # guard against all-padding rows
        embeddings = token_sum / token_count

    return embeddings
src/search-embeddings.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import packages
2
+ import numpy as np
3
+ import polars as pl
4
+
5
+ from encode import encode
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+
8
+ from transformers import AutoTokenizer, AutoModel
9
+
10
+ import glob
11
+
12
+
13
# define the device where torch calculations take place
my_device = "mps"

# Instantiate the sentence-transformer model and move it to that device:
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
sentence_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentence_model = AutoModel.from_pretrained(model_name).to(device = my_device)

# import the block embeddings: every path that starts with `prefix`.
# glob returns matches in arbitrary order, so sort them to make the row order
# of the concatenated frame reproducible between runs.
prefix = "outputs/block-embeddings-"
files = sorted(glob.glob(prefix + "*"))
if not files:
    # Fail with a message that names the pattern instead of letting the
    # concat below reject an empty list.
    raise FileNotFoundError(f"no block-embedding files match {prefix}*")

block_embeddings_list = []
for filename in files:
    print("Reading:", filename)
    block_embeddings_list.append(pl.read_csv(filename))

# Stack the per-file tables row-wise into one corpus table.
block_embeddings_df = pl.concat(block_embeddings_list, how = 'vertical')
31
+
32
def sbert_query(query, corpus_embeddings_df):
    """Rank files by the best cosine similarity between the query and their blocks.

    The query is embedded with the module-level ``sentence_tokenizer`` and
    ``sentence_model`` on ``my_device`` (the device the model was moved to),
    compared with every row of the corpus embeddings, and each file keeps the
    maximum score over its blocks.

    Args:
        query: Query text. The flattened score vector must have exactly one
            entry per corpus row, so this is meant for a single query.
        corpus_embeddings_df: polars DataFrame with ``file`` and
            ``doc_block_indx`` columns; all remaining columns are treated as
            the embedding dimensions.

    Returns:
        A polars DataFrame with columns ``file``, ``score`` and ``rank-sbert``
        (1 = best), sorted by descending score.
    """
    # Pass the device explicitly so the inputs follow the model even if
    # `my_device` is changed away from encode()'s default.
    query_embeddings = encode(
        query,
        tokenizer = sentence_tokenizer,
        model = sentence_model,
        device = my_device,
    ).cpu().numpy()

    # Everything except the two identifier columns is embedding data.
    corpus_matrix = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx']))
    sbert_scores = cosine_similarity(query_embeddings, corpus_matrix)

    # One score per block, collapsed to the best-scoring block of each file.
    per_file_df = pl.DataFrame(
        {
            'score': np.reshape(sbert_scores, -1),
            'file': corpus_embeddings_df['file'],
            'doc_block_indx': corpus_embeddings_df['doc_block_indx'],
        }
    ).group_by("file").agg(pl.col("score").max())

    ranked_df = per_file_df.sort("score", descending = True)

    # Rows are already in score order, so the rank is just the row position.
    return ranked_df.with_columns(
        pl.Series("rank-sbert", list(range(1, ranked_df.height + 1)))
    )
46
+
47
def sbert_query_factory(corpus_embeddings_df):
    """Build a one-argument SBERT search function bound to a corpus table.

    Args:
        corpus_embeddings_df: Corpus embeddings forwarded to ``sbert_query``.

    Returns:
        A callable taking ``my_query`` and returning whatever ``sbert_query``
        returns for that query against the bound corpus.
    """
    return lambda my_query: sbert_query(my_query, corpus_embeddings_df)
52
+
53
+
54
# Bind the loaded block embeddings into a single-argument SBERT search
# function, then run one sample query through it.
sbert_query_docs = sbert_query_factory(block_embeddings_df)

query = "plans for raising grant revenue directed to the libraries"
res_sbert = sbert_query_docs(query)
59
+
60
+
61
+ #res.group_by("file").agg(pl.col("rank").min(), pl.col("score").max()).sort("rank")
62
+
63
+
64
+
65
+
66
+
67
+