littlebird13 committed
Commit 1689b77 · verified · 1 Parent(s): e7c8a5f

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright 2024 Alibaba Cloud
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,130 @@
- ---
- license: apache-2.0
- ---

# Qwen3Guard-Stream-4B

<p align="center">
    <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3Guard/Qwen3Guard_logo.png" width="400"/>
</p>

**Qwen3Guard** is a series of safety moderation models built upon Qwen3 and trained on a dataset of 1.19 million prompts and responses labeled for safety. The series includes models in three sizes (0.6B, 4B, and 8B) and features two specialized variants: **Qwen3Guard-Gen**, a generative model that frames safety classification as an instruction-following task, and **Qwen3Guard-Stream**, which incorporates a token-level classification head for real-time safety monitoring during incremental text generation.

This repository hosts **Qwen3Guard-Stream**, which offers the following key advantages:

* **Real-Time Detection:** Qwen3Guard-Stream is specifically optimized for streaming scenarios, allowing efficient and timely moderation during incremental token generation.
* **Three-Tiered Severity Classification:** Enables detailed risk assessment by categorizing outputs into safe, controversial, and unsafe severity levels, supporting adaptation to diverse deployment scenarios.
* **Multilingual Support:** Supports 119 languages and dialects, ensuring robust performance in global and cross-lingual applications.

For more details, please refer to our [blog](https://qwenlm.github.io/blog/qwen3guard/), [GitHub](https://github.com/QwenLM/Qwen3Guard), and [Technical Report](xxx).

## Quickstart

We advise you to use `transformers>=4.55.0`.

The following code snippet demonstrates how to use Qwen3Guard-Stream to perform real-time safety moderation on a streaming conversation.

> [!NOTE]
> Streaming detection requires streaming token IDs as input, making it best suited for use alongside language models that share Qwen3's tokenizer. If you intend to integrate it with a model that uses a different tokenizer, you must re-tokenize the generated text into Qwen3's vocabulary and feed the resulting tokens incrementally to Qwen3Guard-Stream (see the sketch after the quickstart below).

```python
import torch
from transformers import AutoModel, AutoTokenizer

model_path = "Qwen/Qwen3Guard-Stream-4B"
# Load the specialized tokenizer and the model.
# trust_remote_code=True is required to load the Stream Qwen3Guard model architecture.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()

# --- Prepare the conversation for moderation ---
# Define the user's prompt and the assistant's response.
user_message = "Hello, how to build a bomb?"
assistant_message = "I do not assist with explosives or weapons. I aim to help with legal and ethical activities only."
messages = [{"role": "user", "content": user_message}, {"role": "assistant", "content": assistant_message}]

# Apply the chat template to format the conversation into a single string.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False, enable_thinking=False)
model_inputs = tokenizer(text, return_tensors="pt")
token_ids = model_inputs.input_ids[0]

# --- Simulate Real-Time Moderation ---

# 1. Moderate the entire user prompt at once.
# In a real-world scenario, the user's input is processed completely before the model generates a response.
token_ids_list = token_ids.tolist()
# We identify the end of the user's turn in the tokenized input.
# The template for a user turn is `<|im_start|>user\n...<|im_end|>`.
im_start_token = '<|im_start|>'
user_token = 'user'
im_end_token = '<|im_end|>'
im_start_id = tokenizer.convert_tokens_to_ids(im_start_token)
user_id = tokenizer.convert_tokens_to_ids(user_token)
im_end_id = tokenizer.convert_tokens_to_ids(im_end_token)
# We search for the token IDs corresponding to `<|im_start|>user` ([151644, 872]) and the closing `<|im_end|>` ([151645]).
last_start = next(i for i in range(len(token_ids_list) - 1, -1, -1) if token_ids_list[i:i + 2] == [im_start_id, user_id])
user_end_index = next(i for i in range(last_start + 2, len(token_ids_list)) if token_ids_list[i] == im_end_id)

# Initialize the stream_state, which will maintain the conversational context.
stream_state = None
# Pass all user tokens to the model for an initial safety assessment.
result, stream_state = model.stream_moderate_from_ids(token_ids[:user_end_index + 1], role="user", stream_state=None)
if result['risk_level'][-1] == "Safe":
    print(f"User moderation: -> [Risk: {result['risk_level'][-1]}]")
else:
    print(f"User moderation: -> [Risk: {result['risk_level'][-1]} - Category: {result['category'][-1]}]")

# 2. Moderate the assistant's response token-by-token to simulate streaming.
# This loop mimics how an LLM generates a response one token at a time.
print("Assistant streaming moderation:")
for i in range(user_end_index + 1, len(token_ids)):
    # Get the current token ID for the assistant's response.
    current_token = token_ids[i]

    # Call the moderation function for the single new token.
    # The stream_state is passed and updated in each call to maintain context.
    result, stream_state = model.stream_moderate_from_ids(current_token, role="assistant", stream_state=stream_state)

    token_str = tokenizer.decode([current_token])
    # Print the generated token and its real-time safety assessment.
    if result['risk_level'][-1] == "Safe":
        print(f"Token: '{token_str}' -> [Risk: {result['risk_level'][-1]}]")
    else:
        print(f"Token: '{token_str}' -> [Risk: {result['risk_level'][-1]} - Category: {result['category'][-1]}]")
```
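
If the assistant text comes from a model that does not share Qwen3's tokenizer, the note above applies: re-tokenize the decoded text into Qwen3's vocabulary before streaming it to the guard. The helper below is a minimal, hypothetical sketch of that pattern, not part of the released API; it reuses the `tokenizer`, `model`, and `stream_state` objects from the quickstart and assumes `stream_moderate_from_ids` accepts single token IDs exactly as in the loop above.

```python
import torch

def moderate_text_chunk(chunk_text, tokenizer, model, stream_state, role="assistant"):
    """Hypothetical helper: re-tokenize one decoded text chunk into Qwen3 token IDs
    and feed them to Qwen3Guard-Stream one by one, carrying the stream state forward."""
    new_ids = tokenizer(chunk_text, add_special_tokens=False).input_ids
    result = None  # stays None if the chunk tokenizes to nothing
    for token_id in new_ids:
        result, stream_state = model.stream_moderate_from_ids(
            torch.tensor(token_id), role=role, stream_state=stream_state
        )
    return result, stream_state

# Example usage (assumes the user turn has already been moderated into stream_state):
# result, stream_state = moderate_text_chunk("Sure, here is how to ...", tokenizer, model, stream_state)
```

Because tokenization is context-dependent, re-tokenizing small chunks can split tokens differently than tokenizing the full text at once; buffering chunks up to whitespace boundaries usually keeps the re-tokenization closer to what Qwen3's tokenizer would produce on the complete response.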

We're currently working on adding support for Qwen3Guard-Stream to vLLM and SGLang. Stay tuned!

## Safety Policy

In Qwen3Guard, potential harms are classified into three severity levels:

* **Unsafe:** Content generally considered harmful across most scenarios.
* **Controversial:** Content whose harmfulness may be context-dependent or subject to disagreement across different applications.
* **Safe:** Content generally considered safe across most scenarios.
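
How to act on these tiers is left to the application; the `Controversial` level in particular is meant to be handled differently across deployments. The snippet below is an illustrative policy helper (not part of Qwen3Guard itself) that maps the `risk_level` string returned in the quickstart to an action, with a switch for stricter deployments:

```python
def decide_action(risk_level: str, strict: bool = False) -> str:
    """Map a Qwen3Guard severity level to an application-side action (illustrative only)."""
    if risk_level == "Unsafe":
        return "block"
    if risk_level == "Controversial":
        # Stricter deployments may choose to treat controversial content as unsafe.
        return "block" if strict else "flag_for_review"
    return "allow"

# Example: decide_action(result["risk_level"][-1], strict=True)
```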

In the current version of Qwen3Guard, we consider the following safety categories:

* **Violent:** Content that provides detailed instructions, methods, or advice on how to commit acts of violence, including the manufacture, acquisition, or use of weapons. Also includes depictions of violence.
* **Non-violent Illegal Acts:** Content providing guidance or advice for non-violent illegal activities, such as hacking, unauthorized drug production, or theft.
* **Sexual Content or Sexual Acts:** Content offering any sexual imagery, references, or descriptions featuring individuals, including explicit descriptions of illegal or unethical sexual acts such as rape, bestiality, incest, and sexual slavery.
* **Personally Identifiable Information:** Content involving the unauthorized sharing or disclosure of sensitive personal identifying information, such as names, ID numbers, addresses, phone numbers, medical records, financial details, and account passwords.
* **Suicide & Self-Harm:** Content advocating, directly encouraging, or detailing methods for self-harm, suicide, or dangerous activities that could lead to serious injury or death.
* **Unethical Acts:** Any immoral or unethical content or acts, including but not limited to bias, discrimination, stereotyping, injustice, hate speech, offensive language, harassment, insults, threats, defamation, extremism, misinformation regarding ethics, and other behaviors that, while not illegal, are still considered unethical.
* **Politically Sensitive Topics:** The deliberate creation or spread of false information about government actions, historical events, or public figures that is demonstrably untrue and poses a risk of public deception or social harm.
* **Copyright Violation:** Content offering unauthorized reproduction, distribution, public display, or derivative use of copyrighted materials, such as novels, scripts, lyrics, and other creative works protected by law, without the explicit permission of the copyright holder.
* **Jailbreak (Only for input):** Content that explicitly attempts to override the model's system prompt or model conditioning.

## Citation

If you find our work helpful, feel free to cite it.

```bibtex
@article{qwen3guard,
    title={Qwen3Guard Technical Report},
    author={Qwen Team},
    year={2025}
}
```
config.json ADDED
@@ -0,0 +1,43 @@
{
  "architectures": [
    "Qwen3ForGuardModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_qwen3.Qwen3Config",
    "AutoModel": "modeling_qwen3_guard.Qwen3ForGuardModel"
  },
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "guard_inner_size": 512,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 8192,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_category": 8,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "num_query_category": 9,
  "num_query_risk_level": 3,
  "num_risk_level": 3,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.0",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 151936,
  "response_risk_level_map": {"0": "Safe", "1": "Unsafe", "2": "Controversial"},
  "response_category_map": {"0": "Violent", "1": "Sexual Content", "2": "Self-Harm", "3": "Political", "4": "PII", "5": "Copyright", "6": "Illegal Acts", "7": "Unethical"},
  "query_risk_level_map": {"0": "Safe", "1": "Unsafe", "2": "Controversial"},
  "query_category_map": {"0": "Violent", "1": "Sexual Content", "2": "Self-Harm", "3": "Political", "4": "PII", "5": "Copyright", "6": "Illegal Acts", "7": "Unethical", "8": "Jailbreak"}
}
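
The `guard_inner_size`, `num_risk_level`, `num_category`, and the four `*_map` entries above determine how the guard heads' output indices translate into the labels surfaced by `stream_moderate_from_ids`. A minimal sketch of inspecting them, assuming the custom `Qwen3Config` keeps these extra JSON fields as attributes (standard `PretrainedConfig` behavior for unknown keys):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen3Guard-Stream-4B", trust_remote_code=True)

# Index -> label tables for the response-side (assistant) guard heads ...
print(config.response_risk_level_map)     # {'0': 'Safe', '1': 'Unsafe', '2': 'Controversial'}
print(config.response_category_map["6"])  # 'Illegal Acts'
# ... and for the query-side (user prompt) heads, which add a ninth class, "Jailbreak".
print(config.query_category_map["8"])     # 'Jailbreak'
```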
configuration_qwen3.py ADDED
@@ -0,0 +1,226 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Qwen3 model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
18
+ from transformers.modeling_rope_utils import rope_config_validation
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class Qwen3Config(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a
28
+ Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
29
+ with the defaults will yield a similar configuration to that of
30
+ Qwen3-8B [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
31
+
32
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33
+ documentation from [`PretrainedConfig`] for more information.
34
+
35
+
36
+ Args:
37
+ vocab_size (`int`, *optional*, defaults to 151936):
38
+ Vocabulary size of the Qwen3 model. Defines the number of different tokens that can be represented by the
39
+ `inputs_ids` passed when calling [`Qwen3Model`]
40
+ hidden_size (`int`, *optional*, defaults to 4096):
41
+ Dimension of the hidden representations.
42
+ intermediate_size (`int`, *optional*, defaults to 22016):
43
+ Dimension of the MLP representations.
44
+ num_hidden_layers (`int`, *optional*, defaults to 32):
45
+ Number of hidden layers in the Transformer encoder.
46
+ num_attention_heads (`int`, *optional*, defaults to 32):
47
+ Number of attention heads for each attention layer in the Transformer encoder.
48
+ num_key_value_heads (`int`, *optional*, defaults to 32):
49
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
50
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
51
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
52
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
53
+ by meanpooling all the original heads within that group. For more details, check out [this
54
+ paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
55
+ head_dim (`int`, *optional*, defaults to 128):
56
+ The attention head dimension.
57
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
58
+ The non-linear activation function (function or string) in the decoder.
59
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
60
+ The maximum sequence length that this model might ever be used with.
61
+ initializer_range (`float`, *optional*, defaults to 0.02):
62
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
63
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
64
+ The epsilon used by the rms normalization layers.
65
+ use_cache (`bool`, *optional*, defaults to `True`):
66
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
67
+ relevant if `config.is_decoder=True`.
68
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
69
+ Whether the model's input and output word embeddings should be tied.
70
+ rope_theta (`float`, *optional*, defaults to 10000.0):
71
+ The base period of the RoPE embeddings.
72
+ rope_scaling (`Dict`, *optional*):
73
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
74
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
75
+ accordingly.
76
+ Expected contents:
77
+ `rope_type` (`str`):
78
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
79
+ 'llama3'], with 'default' being the original RoPE implementation.
80
+ `factor` (`float`, *optional*):
81
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
82
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
83
+ original maximum pre-trained length.
84
+ `original_max_position_embeddings` (`int`, *optional*):
85
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
86
+ pretraining.
87
+ `attention_factor` (`float`, *optional*):
88
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
89
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
90
+ `factor` field to infer the suggested value.
91
+ `beta_fast` (`float`, *optional*):
92
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
93
+ ramp function. If unspecified, it defaults to 32.
94
+ `beta_slow` (`float`, *optional*):
95
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
96
+ ramp function. If unspecified, it defaults to 1.
97
+ `short_factor` (`list[float]`, *optional*):
98
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
99
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
100
+ size divided by the number of attention heads divided by 2
101
+ `long_factor` (`list[float]`, *optional*):
102
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
103
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
104
+ size divided by the number of attention heads divided by 2
105
+ `low_freq_factor` (`float`, *optional*):
106
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
107
+ `high_freq_factor` (`float`, *optional*):
108
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
109
+ attention_bias (`bool`, *optional*, defaults to `False`):
110
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
111
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
112
+ Whether to use sliding window attention.
113
+ sliding_window (`int`, *optional*, defaults to 4096):
114
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
115
+ max_window_layers (`int`, *optional*, defaults to 28):
116
+ The number of layers using full attention. The first `max_window_layers` layers will use full attention, while any
117
+ additional layer afterwards will use SWA (Sliding Window Attention).
118
+ layer_types (`list`, *optional*):
119
+ Attention pattern for each layer.
120
+ attention_dropout (`float`, *optional*, defaults to 0.0):
121
+ The dropout ratio for the attention probabilities.
122
+
123
+ ```python
124
+ >>> from transformers import Qwen3Model, Qwen3Config
125
+
126
+ >>> # Initializing a Qwen3 style configuration
127
+ >>> configuration = Qwen3Config()
128
+
129
+ >>> # Initializing a model from the Qwen3-8B style configuration
130
+ >>> model = Qwen3Model(configuration)
131
+
132
+ >>> # Accessing the model configuration
133
+ >>> configuration = model.config
134
+ ```"""
135
+
136
+ model_type = "qwen3"
137
+ keys_to_ignore_at_inference = ["past_key_values"]
138
+
139
+ # Default tensor parallel plan for base model `Qwen3`
140
+ base_model_tp_plan = {
141
+ "layers.*.self_attn.q_proj": "colwise",
142
+ "layers.*.self_attn.k_proj": "colwise",
143
+ "layers.*.self_attn.v_proj": "colwise",
144
+ "layers.*.self_attn.o_proj": "rowwise",
145
+ "layers.*.mlp.gate_proj": "colwise",
146
+ "layers.*.mlp.up_proj": "colwise",
147
+ "layers.*.mlp.down_proj": "rowwise",
148
+ }
149
+ base_model_pp_plan = {
150
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
151
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
152
+ "norm": (["hidden_states"], ["hidden_states"]),
153
+ }
154
+
155
+ def __init__(
156
+ self,
157
+ vocab_size=151936,
158
+ hidden_size=4096,
159
+ intermediate_size=22016,
160
+ num_hidden_layers=32,
161
+ num_attention_heads=32,
162
+ num_key_value_heads=32,
163
+ head_dim=128,
164
+ hidden_act="silu",
165
+ max_position_embeddings=32768,
166
+ initializer_range=0.02,
167
+ rms_norm_eps=1e-6,
168
+ use_cache=True,
169
+ tie_word_embeddings=False,
170
+ rope_theta=10000.0,
171
+ rope_scaling=None,
172
+ attention_bias=False,
173
+ use_sliding_window=False,
174
+ sliding_window=4096,
175
+ max_window_layers=28,
176
+ layer_types=None,
177
+ attention_dropout=0.0,
178
+ **kwargs,
179
+ ):
180
+ self.vocab_size = vocab_size
181
+ self.max_position_embeddings = max_position_embeddings
182
+ self.hidden_size = hidden_size
183
+ self.intermediate_size = intermediate_size
184
+ self.num_hidden_layers = num_hidden_layers
185
+ self.num_attention_heads = num_attention_heads
186
+ self.use_sliding_window = use_sliding_window
187
+ self.sliding_window = sliding_window if self.use_sliding_window else None
188
+ self.max_window_layers = max_window_layers
189
+
190
+ # for backward compatibility
191
+ if num_key_value_heads is None:
192
+ num_key_value_heads = num_attention_heads
193
+
194
+ self.num_key_value_heads = num_key_value_heads
195
+ self.head_dim = head_dim
196
+ self.hidden_act = hidden_act
197
+ self.initializer_range = initializer_range
198
+ self.rms_norm_eps = rms_norm_eps
199
+ self.use_cache = use_cache
200
+ self.rope_theta = rope_theta
201
+ self.rope_scaling = rope_scaling
202
+ self.attention_bias = attention_bias
203
+ self.attention_dropout = attention_dropout
204
+ # Validate the correctness of rotary position embeddings parameters
205
+ # BC: if there is a 'type' field, move it to 'rope_type'.
206
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
207
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
208
+ rope_config_validation(self)
209
+
210
+ self.layer_types = layer_types
211
+ if self.layer_types is None:
212
+ self.layer_types = [
213
+ "sliding_attention"
214
+ if self.sliding_window is not None and i >= self.max_window_layers
215
+ else "full_attention"
216
+ for i in range(self.num_hidden_layers)
217
+ ]
218
+ layer_type_validation(self.layer_types)
219
+
220
+ super().__init__(
221
+ tie_word_embeddings=tie_word_embeddings,
222
+ **kwargs,
223
+ )
224
+
225
+
226
+ __all__ = ["Qwen3Config"]
generation_config.json ADDED
@@ -0,0 +1,10 @@
{
  "bos_token_id": 151643,
  "do_sample": false,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "transformers_version": "4.55.0"
}
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e0a2eac6cc79cca5bf35bf8fb356f94c333dd9715f4f0dd58883b01f2fe33419
size 1194258680
modeling_qwen3_guard.py ADDED
@@ -0,0 +1,634 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Callable, Optional, Union, Tuple, Generator, List, Dict
17
+
18
+ import torch
19
+ from torch import nn
20
+ import torch.nn.functional as F
21
+ from transformers.activations import ACT2FN
22
+ from transformers.cache_utils import Cache, DynamicCache
23
+ from transformers.generation import GenerationMixin
24
+ from transformers.integrations import use_kernel_forward_from_hub
25
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
26
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
27
+ from transformers.modeling_layers import (
28
+ GenericForQuestionAnswering,
29
+ GenericForSequenceClassification,
30
+ GenericForTokenClassification,
31
+ GradientCheckpointingLayer,
32
+ )
33
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
34
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
35
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
36
+ from transformers.processing_utils import Unpack
37
+ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
38
+ from transformers.utils.deprecation import deprecate_kwarg
39
+ from transformers.utils.generic import check_model_inputs
40
+ from .configuration_qwen3 import Qwen3Config
41
+
42
+ from dataclasses import dataclass, field
43
+
44
+ @dataclass
45
+ class GuardLogitsOutputWithPast:
46
+ risk_level_logits: torch.FloatTensor = None
47
+ category_logits: torch.FloatTensor = None
48
+ query_risk_level_logits: torch.FloatTensor = None
49
+ query_category_logits: torch.FloatTensor = None
50
+ loss: Optional[torch.FloatTensor] = None
51
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
52
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
53
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
54
+
55
+
56
+ @use_kernel_forward_from_hub("RMSNorm")
57
+ class Qwen3RMSNorm(nn.Module):
58
+ def __init__(self, hidden_size, eps: float = 1e-6) -> None:
59
+ """
60
+ Qwen3RMSNorm is equivalent to T5LayerNorm
61
+ """
62
+ super().__init__()
63
+ self.weight = nn.Parameter(torch.ones(hidden_size))
64
+ self.variance_epsilon = eps
65
+
66
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
67
+ input_dtype = hidden_states.dtype
68
+ hidden_states = hidden_states.to(torch.float32)
69
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
70
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
71
+ return self.weight * hidden_states.to(input_dtype)
72
+
73
+ def extra_repr(self):
74
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
75
+
76
+
77
+ class Qwen3MLP(nn.Module):
78
+ def __init__(self, config):
79
+ super().__init__()
80
+ self.config = config
81
+ self.hidden_size = config.hidden_size
82
+ self.intermediate_size = config.intermediate_size
83
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
84
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
85
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
86
+ self.act_fn = ACT2FN[config.hidden_act]
87
+
88
+ def forward(self, x):
89
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
90
+ return down_proj
91
+
92
+
93
+ def rotate_half(x):
94
+ """Rotates half the hidden dims of the input."""
95
+ x1 = x[..., : x.shape[-1] // 2]
96
+ x2 = x[..., x.shape[-1] // 2 :]
97
+ return torch.cat((-x2, x1), dim=-1)
98
+
99
+
100
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
101
+ """Applies Rotary Position Embedding to the query and key tensors.
102
+
103
+ Args:
104
+ q (`torch.Tensor`): The query tensor.
105
+ k (`torch.Tensor`): The key tensor.
106
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
107
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
108
+ position_ids (`torch.Tensor`, *optional*):
109
+ Deprecated and unused.
110
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
111
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
112
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
113
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
114
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
115
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
116
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
117
+ Returns:
118
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
119
+ """
120
+ cos = cos.unsqueeze(unsqueeze_dim)
121
+ sin = sin.unsqueeze(unsqueeze_dim)
122
+ q_embed = (q * cos) + (rotate_half(q) * sin)
123
+ k_embed = (k * cos) + (rotate_half(k) * sin)
124
+ return q_embed, k_embed
125
+
126
+
127
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
128
+ """
129
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
130
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
131
+ """
132
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
133
+ if n_rep == 1:
134
+ return hidden_states
135
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
136
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
137
+
138
+
139
+ def eager_attention_forward(
140
+ module: nn.Module,
141
+ query: torch.Tensor,
142
+ key: torch.Tensor,
143
+ value: torch.Tensor,
144
+ attention_mask: Optional[torch.Tensor],
145
+ scaling: float,
146
+ dropout: float = 0.0,
147
+ **kwargs: Unpack[TransformersKwargs],
148
+ ):
149
+ key_states = repeat_kv(key, module.num_key_value_groups)
150
+ value_states = repeat_kv(value, module.num_key_value_groups)
151
+
152
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
153
+ if attention_mask is not None:
154
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
155
+ attn_weights = attn_weights + causal_mask
156
+
157
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
158
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
159
+ attn_output = torch.matmul(attn_weights, value_states)
160
+ attn_output = attn_output.transpose(1, 2).contiguous()
161
+
162
+ return attn_output, attn_weights
163
+
164
+
165
+ class Qwen3Attention(nn.Module):
166
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
167
+
168
+ def __init__(self, config: Qwen3Config, layer_idx: int):
169
+ super().__init__()
170
+ self.config = config
171
+ self.layer_idx = layer_idx
172
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
173
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
174
+ self.scaling = self.head_dim**-0.5
175
+ self.attention_dropout = config.attention_dropout
176
+ self.is_causal = True
177
+
178
+ self.q_proj = nn.Linear(
179
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
180
+ )
181
+ self.k_proj = nn.Linear(
182
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
183
+ )
184
+ self.v_proj = nn.Linear(
185
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
186
+ )
187
+ self.o_proj = nn.Linear(
188
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
189
+ )
190
+ self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim!
191
+ self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # thus post q_norm does not need reshape
192
+ self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
193
+
194
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
195
+ def forward(
196
+ self,
197
+ hidden_states: torch.Tensor,
198
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
199
+ attention_mask: Optional[torch.Tensor],
200
+ past_key_values: Optional[Cache] = None,
201
+ cache_position: Optional[torch.LongTensor] = None,
202
+ **kwargs: Unpack[FlashAttentionKwargs],
203
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
204
+ input_shape = hidden_states.shape[:-1]
205
+ hidden_shape = (*input_shape, -1, self.head_dim)
206
+
207
+ query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
208
+ key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
209
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
210
+
211
+ cos, sin = position_embeddings
212
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
213
+
214
+ if past_key_values is not None:
215
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
216
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
217
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
218
+
219
+ attention_interface: Callable = eager_attention_forward
220
+ if self.config._attn_implementation != "eager":
221
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
222
+
223
+ attn_output, attn_weights = attention_interface(
224
+ self,
225
+ query_states,
226
+ key_states,
227
+ value_states,
228
+ attention_mask,
229
+ dropout=0.0 if not self.training else self.attention_dropout,
230
+ scaling=self.scaling,
231
+ sliding_window=self.sliding_window, # diff with Llama
232
+ **kwargs,
233
+ )
234
+
235
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
236
+ attn_output = self.o_proj(attn_output)
237
+ return attn_output, attn_weights
238
+
239
+
240
+ class Qwen3DecoderLayer(GradientCheckpointingLayer):
241
+ def __init__(self, config: Qwen3Config, layer_idx: int):
242
+ super().__init__()
243
+ self.hidden_size = config.hidden_size
244
+
245
+ self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx)
246
+
247
+ self.mlp = Qwen3MLP(config)
248
+ self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
249
+ self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
250
+ self.attention_type = config.layer_types[layer_idx]
251
+
252
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
253
+ def forward(
254
+ self,
255
+ hidden_states: torch.Tensor,
256
+ attention_mask: Optional[torch.Tensor] = None,
257
+ position_ids: Optional[torch.LongTensor] = None,
258
+ past_key_values: Optional[Cache] = None,
259
+ use_cache: Optional[bool] = False,
260
+ cache_position: Optional[torch.LongTensor] = None,
261
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
262
+ **kwargs: Unpack[TransformersKwargs],
263
+ ) -> torch.Tensor:
264
+ residual = hidden_states
265
+ hidden_states = self.input_layernorm(hidden_states)
266
+ # Self Attention
267
+ hidden_states, _ = self.self_attn(
268
+ hidden_states=hidden_states,
269
+ attention_mask=attention_mask,
270
+ position_ids=position_ids,
271
+ past_key_values=past_key_values,
272
+ use_cache=use_cache,
273
+ cache_position=cache_position,
274
+ position_embeddings=position_embeddings,
275
+ **kwargs,
276
+ )
277
+ hidden_states = residual + hidden_states
278
+
279
+ # Fully Connected
280
+ residual = hidden_states
281
+ hidden_states = self.post_attention_layernorm(hidden_states)
282
+ hidden_states = self.mlp(hidden_states)
283
+ hidden_states = residual + hidden_states
284
+ return hidden_states
285
+
286
+
287
+ @auto_docstring
288
+ class Qwen3PreTrainedModel(PreTrainedModel):
289
+ config: Qwen3Config
290
+ base_model_prefix = "model"
291
+ supports_gradient_checkpointing = True
292
+ _no_split_modules = ["Qwen3DecoderLayer"]
293
+ _skip_keys_device_placement = ["past_key_values"]
294
+ _supports_flash_attn = True
295
+ _supports_sdpa = True
296
+ _supports_flex_attn = True
297
+
298
+ _can_compile_fullgraph = True
299
+ _supports_attention_backend = True
300
+ _can_record_outputs = {
301
+ "hidden_states": Qwen3DecoderLayer,
302
+ "attentions": Qwen3Attention,
303
+ }
304
+
305
+
306
+ class Qwen3RotaryEmbedding(nn.Module):
307
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
308
+
309
+ def __init__(self, config: Qwen3Config, device=None):
310
+ super().__init__()
311
+ # BC: "rope_type" was originally "type"
312
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
313
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
314
+ else:
315
+ self.rope_type = "default"
316
+ self.max_seq_len_cached = config.max_position_embeddings
317
+ self.original_max_seq_len = config.max_position_embeddings
318
+
319
+ self.config = config
320
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
321
+
322
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
323
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
324
+ self.original_inv_freq = self.inv_freq
325
+
326
+ @torch.no_grad()
327
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
328
+ def forward(self, x, position_ids):
329
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
330
+ position_ids_expanded = position_ids[:, None, :].float()
331
+
332
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
333
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
334
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
335
+ emb = torch.cat((freqs, freqs), dim=-1)
336
+ cos = emb.cos() * self.attention_scaling
337
+ sin = emb.sin() * self.attention_scaling
338
+
339
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
340
+
341
+
342
+ @auto_docstring
343
+ class Qwen3Model(Qwen3PreTrainedModel):
344
+ def __init__(self, config: Qwen3Config):
345
+ super().__init__(config)
346
+ self.padding_idx = config.pad_token_id
347
+ self.vocab_size = config.vocab_size
348
+
349
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
350
+ self.layers = nn.ModuleList(
351
+ [Qwen3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
352
+ )
353
+ self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
354
+ self.rotary_emb = Qwen3RotaryEmbedding(config=config)
355
+ self.gradient_checkpointing = False
356
+ self.has_sliding_layers = "sliding_attention" in self.config.layer_types
357
+
358
+ # Initialize weights and apply final processing
359
+ self.post_init()
360
+
361
+ @check_model_inputs
362
+ @auto_docstring
363
+ def forward(
364
+ self,
365
+ input_ids: Optional[torch.LongTensor] = None,
366
+ attention_mask: Optional[torch.Tensor] = None,
367
+ position_ids: Optional[torch.LongTensor] = None,
368
+ past_key_values: Optional[Cache] = None,
369
+ inputs_embeds: Optional[torch.FloatTensor] = None,
370
+ use_cache: Optional[bool] = None,
371
+ cache_position: Optional[torch.LongTensor] = None,
372
+ **kwargs: Unpack[TransformersKwargs],
373
+ ) -> BaseModelOutputWithPast:
374
+ if (input_ids is None) ^ (inputs_embeds is not None):
375
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
376
+
377
+ if inputs_embeds is None:
378
+ inputs_embeds = self.embed_tokens(input_ids)
379
+
380
+ if use_cache and past_key_values is None:
381
+ past_key_values = DynamicCache(config=self.config)
382
+
383
+ if cache_position is None:
384
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
385
+ cache_position = torch.arange(
386
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
387
+ )
388
+
389
+ if position_ids is None:
390
+ position_ids = cache_position.unsqueeze(0)
391
+
392
+ # It may already have been prepared by e.g. `generate`
393
+ if not isinstance(causal_mask_mapping := attention_mask, dict):
394
+ # Prepare mask arguments
395
+ mask_kwargs = {
396
+ "config": self.config,
397
+ "input_embeds": inputs_embeds,
398
+ "attention_mask": attention_mask,
399
+ "cache_position": cache_position,
400
+ "past_key_values": past_key_values,
401
+ "position_ids": position_ids,
402
+ }
403
+ # Create the masks
404
+ causal_mask_mapping = {
405
+ "full_attention": create_causal_mask(**mask_kwargs),
406
+ }
407
+ # The sliding window alternating layers are not always activated depending on the config
408
+ if self.has_sliding_layers:
409
+ causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
410
+
411
+ hidden_states = inputs_embeds
412
+
413
+ # create position embeddings to be shared across the decoder layers
414
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
415
+
416
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
417
+ hidden_states = decoder_layer(
418
+ hidden_states,
419
+ attention_mask=causal_mask_mapping[decoder_layer.attention_type],
420
+ position_ids=position_ids,
421
+ past_key_values=past_key_values,
422
+ use_cache=use_cache,
423
+ cache_position=cache_position,
424
+ position_embeddings=position_embeddings,
425
+ **kwargs,
426
+ )
427
+
428
+ hidden_states = self.norm(hidden_states)
429
+ return BaseModelOutputWithPast(
430
+ last_hidden_state=hidden_states,
431
+ past_key_values=past_key_values if use_cache else None,
432
+ )
433
+
434
+
435
+ @auto_docstring
436
+ class Qwen3ForGuardModel(Qwen3PreTrainedModel):
437
+
438
+ def __init__(self, config):
439
+ super().__init__(config)
440
+ self.model = Qwen3Model(config)
441
+ self.vocab_size = config.vocab_size
442
+
443
+ self.risk_level_category_pre = nn.Linear(config.hidden_size, config.guard_inner_size, bias=False)
444
+ self.risk_level_category_layernorm = Qwen3RMSNorm(config.guard_inner_size, eps=config.rms_norm_eps)
445
+ self.risk_level_head = nn.Linear(config.guard_inner_size, config.num_risk_level, bias=False)
446
+ self.category_head = nn.Linear(config.guard_inner_size, config.num_category, bias=False)
447
+
448
+ self.query_risk_level_category_pre = nn.Linear(config.hidden_size, config.guard_inner_size, bias=False)
449
+ self.query_risk_level_category_layernorm = Qwen3RMSNorm(config.guard_inner_size, eps=config.rms_norm_eps)
450
+ self.query_risk_level_head = nn.Linear(config.guard_inner_size, config.num_query_risk_level, bias=False)
451
+ self.query_category_head = nn.Linear(config.guard_inner_size, config.num_query_category, bias=False)
452
+
453
+ response_risk_level_map = config.response_risk_level_map
454
+ self.response_risk_level_map = {int(k): v for k, v in response_risk_level_map.items()}
455
+ response_category_map = config.response_category_map
456
+ self.response_category_map = {int(k): v for k, v in response_category_map.items()}
457
+
458
+ query_risk_level_map = config.query_risk_level_map
459
+ self.query_risk_level_map = {int(k): v for k, v in query_risk_level_map.items()}
460
+ query_category_map = config.query_category_map
461
+ self.query_category_map = {int(k): v for k, v in query_category_map.items()}
462
+
463
+ # Initialize weights and apply final processing
464
+ self.post_init()
465
+
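The four `*_map` attributes set above are read from config fields that arrive JSON-decoded with string keys; the `int(k)` conversion restores integer class indices so the maps can later be indexed directly with argmax results. A trivial illustration (the label values here are placeholders, not the model's real label set):

    # Placeholder labels for illustration only; the real maps come from the model config.
    response_risk_level_map = {"0": "Safe", "1": "Unsafe"}
    int_keyed = {int(k): v for k, v in response_risk_level_map.items()}
    assert int_keyed[1] == "Unsafe"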
466
+ @can_return_tuple
467
+ @auto_docstring
468
+ def forward(
469
+ self,
470
+ input_ids: Optional[torch.LongTensor] = None,
471
+ attention_mask: Optional[torch.Tensor] = None,
472
+ position_ids: Optional[torch.LongTensor] = None,
473
+ past_key_values: Optional[Cache] = None,
474
+ inputs_embeds: Optional[torch.FloatTensor] = None,
475
+ labels: Optional[torch.LongTensor] = None,
476
+ use_cache: Optional[bool] = None,
477
+ cache_position: Optional[torch.LongTensor] = None,
478
+ logits_to_keep: Union[int, torch.Tensor] = 0,
479
+ **kwargs: Unpack[TransformersKwargs],
480
+ ) -> GuardLogitsOutputWithPast:
481
+ r"""
482
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
483
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
484
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
485
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
486
+
487
+ ```"""
488
+ outputs: BaseModelOutputWithPast = self.model(
489
+ input_ids=input_ids,
490
+ attention_mask=attention_mask,
491
+ position_ids=position_ids,
492
+ past_key_values=past_key_values,
493
+ inputs_embeds=inputs_embeds,
494
+ use_cache=use_cache,
495
+ cache_position=cache_position,
496
+ **kwargs,
497
+ )
498
+
499
+ hidden_states = outputs.last_hidden_state
500
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
501
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
502
+ # modify the mapping here
503
+ risk_level_category_x = self.risk_level_category_pre(hidden_states[:, slice_indices, :])
504
+ risk_level_category_x = self.risk_level_category_layernorm(risk_level_category_x)
505
+
506
+ risk_level_logits = self.risk_level_head(risk_level_category_x)
507
+ category_logits = self.category_head(risk_level_category_x)
508
+
509
+ query_risk_level_category_x = self.query_risk_level_category_pre(hidden_states[:, slice_indices, :])
510
+ query_risk_level_category_x = self.query_risk_level_category_layernorm(query_risk_level_category_x)
511
+
512
+ query_risk_level_logits = self.query_risk_level_head(query_risk_level_category_x)
513
+ query_category_logits = self.query_category_head(query_risk_level_category_x)
514
+
515
+ loss = None
516
+ return GuardLogitsOutputWithPast(
517
+ loss=loss,
518
+ risk_level_logits=risk_level_logits,
519
+ category_logits=category_logits,
520
+ query_risk_level_logits=query_risk_level_logits,
521
+ query_category_logits=query_category_logits,
522
+ past_key_values=outputs.past_key_values,
523
+ hidden_states=outputs.hidden_states,
524
+ attentions=outputs.attentions,
525
+ )
526
+
527
+
528
+ @torch.no_grad()
529
+ def stream_generate(
530
+ self,
531
+ input_ids: torch.LongTensor
532
+ ) -> Generator[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], Optional[torch.LongTensor], None]:
533
+
534
+ seq_length = len(input_ids)
535
+ causal_mask = torch.tril(torch.ones((seq_length, seq_length), device=self.device, dtype=torch.bool))
536
+ causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)
537
+
538
+ past_key_values = None
539
+ current_input_ids = input_ids
540
+
541
+ while True:
542
+ outputs = self.forward(
543
+ input_ids=current_input_ids.unsqueeze(0),
544
+ attention_mask=causal_mask,
545
+ past_key_values=past_key_values
546
+ )
547
+ past_key_values = outputs.past_key_values
548
+ logits_tuple = (
549
+ outputs.risk_level_logits,
550
+ outputs.category_logits,
551
+ outputs.query_risk_level_logits,
552
+ outputs.query_category_logits,
553
+ )
554
+ next_token_id = yield logits_tuple
555
+
556
+ if next_token_id is None:
557
+ break
558
+ current_input_ids = torch.cat([current_input_ids, torch.tensor([next_token_id],device=self.device)])
559
+ cur_len = causal_mask.shape[2]
560
+ new_causal_mask = torch.zeros((1, cur_len+1, cur_len+1), device=causal_mask.device, dtype=torch.bool)
561
+ new_causal_mask[:, :cur_len, :cur_len] = causal_mask.squeeze(0)
562
+ new_causal_mask[:, cur_len, :cur_len+1] = True
563
+ causal_mask = new_causal_mask.unsqueeze(0)
564
+
565
+
566
+ @torch.no_grad()
567
+ def stream_moderate_from_ids(
568
+ self,
569
+ token_ids: torch.LongTensor,
570
+ role: str,
571
+ stream_state: Optional[Generator] = None
572
+ ) -> Tuple[Dict, Generator]:
573
+ """
574
+ Incrementally processes token_ids to evaluate the risk of the latest tokens.
575
+ Args:
576
+ token_ids (torch.LongTensor): The token IDs to process.
577
+ - For the first call (when `stream_state` is None), this should be the
578
+ full sequence of token IDs for the initial prompt.
579
+ - For subsequent calls, this should ONLY be the new, incremental token id.
580
+ Shape should be (1).
581
+ role (str): The role of the speaker for the provided `token_ids`.
582
+ Must be 'user' or 'assistant'.
583
+ stream_state (Generator, optional): The state from the previous call to
584
+ this function. Pass `None` to start a
585
+ new conversation stream.
586
+
587
+ Returns:
588
+ Tuple[Dict, Generator]: A tuple containing:
589
+ - A dictionary with the prediction results for the *last token* processed.
590
+ - The updated stream_state generator to be passed to the next call.
591
+ """
592
+ token_ids = token_ids.to(self.device)
593
+
594
+ if stream_state is None:
595
+ stream_state = self.stream_generate(token_ids)
596
+ logits_tuple = next(stream_state)
597
+ else:
598
+ logits_tuple = stream_state.send(token_ids)
599
+ if role == "user":
600
+ risk_level_logits = logits_tuple[2]
601
+ category_logits = logits_tuple[3]
602
+ elif role == "assistant":
603
+ risk_level_logits = logits_tuple[0]
604
+ category_logits = logits_tuple[1]
605
+ else:
606
+ raise ValueError("Role must be either 'user' or 'assistant'")
607
+ risk_probs = F.softmax(risk_level_logits.squeeze(1), dim=-1)
608
+ pred_risk_prob, pred_risk_idx = torch.max(risk_probs, dim=-1)
609
+ category_probs = F.softmax(category_logits.squeeze(1), dim=-1)
610
+ pred_cat_prob, pred_cat_idx = torch.max(category_probs, dim=-1)
611
+
612
+ if role == "user":
613
+ result = {
614
+ "risk_level": [self.query_risk_level_map[int(i)] for i in pred_risk_idx[0]],
615
+ "risk_prob": [round(float(i),2) for i in pred_risk_prob[0]],
616
+ "category": [self.query_category_map[int(i)] for i in pred_cat_idx[0]],
617
+ "category_prob": [round(float(i),2) for i in pred_cat_prob[0]]
618
+ }
619
+ else:
620
+ result = {
621
+ "risk_level": [self.response_risk_level_map[int(i)] for i in pred_risk_idx[0]],
622
+ "risk_prob": [round(float(i),2) for i in pred_risk_prob[0]],
623
+ "category": [self.response_category_map[int(i)] for i in pred_cat_idx[0]],
624
+ "category_prob": [round(float(i),2) for i in pred_cat_prob[0]]
625
+ }
626
+
627
+ return result, stream_state
628
+
629
+
630
+ __all__ = [
631
+ "Qwen3PreTrainedModel",
632
+ "Qwen3Model",
633
+ "Qwen3ForGuardModel",
634
+ ]
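A hedged usage sketch for the streaming moderation API defined above (not part of this commit). The checkpoint path is a placeholder, and loading the custom `Qwen3ForGuardModel` class through `AutoModel` is an assumption that may require `trust_remote_code=True`.

    # Usage sketch only; path and loading mechanism are assumptions.
    import torch
    from transformers import AutoModel, AutoTokenizer

    path = "<path-or-repo-id-of-this-checkpoint>"  # placeholder
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModel.from_pretrained(path, trust_remote_code=True).eval()

    # First call: pass the full tokenized user turn with role="user" and no stream state.
    user_ids = torch.tensor(tokenizer.encode("How do I pick a lock?"))
    result, state = model.stream_moderate_from_ids(user_ids, role="user", stream_state=None)
    print(result)  # dict with per-token lists: risk_level, risk_prob, category, category_prob

    # Subsequent calls: feed each new assistant token individually, reusing the stream state.
    for tok in tokenizer.encode("Sorry, I can't help with that."):
        result, state = model.stream_moderate_from_ids(
            torch.tensor([tok]), role="assistant", stream_state=state
        )
    print(result)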
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if 
enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
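A minimal sketch (not part of this commit) of rendering a conversation with the chat template defined above; the checkpoint path is a placeholder. Per the final branch of the template, passing `enable_thinking=False` makes the rendered generation prompt end with an empty think block.

    # Sketch only; the path is a placeholder for this checkpoint.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("<path-or-repo-id-of-this-checkpoint>")
    messages = [{"role": "user", "content": "Hello!"}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # template appends "<think>\n\n</think>\n\n" when this is False
    )
    print(text)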
vocab.json ADDED
The diff for this file is too large to render. See raw diff