yusenthebot committed on
Commit
aa3fdef
·
1 Parent(s): 902e65e

Integrate advanced OCR, FlashcardGenerator, DifficultyScorer, and AI Quiz from language project

data/cefr/cefr_words.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "name": "CEFR (Common European Framework of Reference)",
3
+ "description": "Proficiency levels for European languages",
4
+ "languages": ["en", "de", "es", "fr", "it"],
5
+ "source": "Sample data - Replace with complete CEFR database for production",
6
+ "levels": {
7
+ "A1": {
8
+ "description": "Beginner",
9
+ "score": 1,
10
+ "en": ["hello", "goodbye", "yes", "no", "please", "thank you", "water", "food", "house", "cat", "dog", "book", "table", "chair", "good", "bad", "big", "small", "one", "two"],
11
+ "de": ["hallo", "tschüss", "ja", "nein", "bitte", "danke", "wasser", "essen", "haus", "katze", "hund", "buch", "tisch", "stuhl", "gut", "schlecht", "groß", "klein", "eins", "zwei"],
12
+ "es": ["hola", "adiós", "sí", "no", "por favor", "gracias", "agua", "comida", "casa", "gato", "perro", "libro", "mesa", "silla", "bueno", "malo", "grande", "pequeño", "uno", "dos"]
13
+ },
14
+ "A2": {
15
+ "description": "Elementary",
16
+ "score": 2,
17
+ "en": ["restaurant", "morning", "afternoon", "week", "month", "family", "friend", "work", "school", "city", "country", "weather", "summer", "winter", "happy", "sad", "easy", "difficult", "beautiful", "expensive"],
18
+ "de": ["restaurant", "morgen", "nachmittag", "woche", "monat", "familie", "freund", "arbeit", "schule", "stadt", "land", "wetter", "sommer", "winter", "glücklich", "traurig", "einfach", "schwierig", "schön", "teuer"],
19
+ "es": ["restaurante", "mañana", "tarde", "semana", "mes", "familia", "amigo", "trabajo", "escuela", "ciudad", "país", "tiempo", "verano", "invierno", "feliz", "triste", "fácil", "difícil", "hermoso", "caro"]
20
+ },
21
+ "B1": {
22
+ "description": "Intermediate",
23
+ "score": 3,
24
+ "en": ["experience", "environment", "situation", "knowledge", "relationship", "government", "education", "opportunity", "responsibility", "technology", "culture", "society", "development", "economy", "necessary", "available", "significant", "traditional", "generally", "particularly"],
25
+ "de": ["erfahrung", "umwelt", "situation", "wissen", "beziehung", "regierung", "bildung", "gelegenheit", "verantwortung", "technologie", "kultur", "gesellschaft", "entwicklung", "wirtschaft", "notwendig", "verfügbar", "bedeutend", "traditionell", "allgemein", "besonders"],
26
+ "es": ["experiencia", "medio ambiente", "situación", "conocimiento", "relación", "gobierno", "educación", "oportunidad", "responsabilidad", "tecnología", "cultura", "sociedad", "desarrollo", "economía", "necesario", "disponible", "significativo", "tradicional", "generalmente", "particularmente"]
27
+ },
28
+ "B2": {
29
+ "description": "Upper Intermediate",
30
+ "score": 4,
31
+ "en": ["implementation", "infrastructure", "phenomenon", "component", "perspective", "theoretical", "comprehensive", "substantial", "predominantly", "furthermore", "nevertheless", "consequently", "regarding", "whereas", "thereby"],
32
+ "de": ["umsetzung", "infrastruktur", "phänomen", "komponente", "perspektive", "theoretisch", "umfassend", "wesentlich", "vorwiegend", "außerdem", "dennoch", "folglich", "bezüglich", "wohingegen", "dadurch"],
33
+ "es": ["implementación", "infraestructura", "fenómeno", "componente", "perspectiva", "teórico", "comprensivo", "sustancial", "predominantemente", "además", "sin embargo", "consecuentemente", "respecto a", "mientras que", "por lo tanto"]
34
+ },
35
+ "C1": {
36
+ "description": "Advanced",
37
+ "score": 5,
38
+ "en": ["aforementioned", "notwithstanding", "juxtapose", "paradigm", "methodology", "hypothesis", "empirical", "ambiguous", "intrinsic", "exacerbate", "unprecedented", "inadvertent", "inherent", "albeit", "albeit"],
39
+ "de": ["oben erwähnt", "ungeachtet", "gegenüberstellen", "paradigma", "methodologie", "hypothese", "empirisch", "mehrdeutig", "intrinsisch", "verschärfen", "beispiellos", "unbeabsichtigt", "inhärent", "obwohl", "obgleich"],
40
+ "es": ["mencionado anteriormente", "no obstante", "yuxtaponer", "paradigma", "metodología", "hipótesis", "empírico", "ambiguo", "intrínseco", "exacerbar", "sin precedentes", "inadvertido", "inherente", "aunque", "si bien"]
41
+ },
42
+ "C2": {
43
+ "description": "Proficient",
44
+ "score": 6,
45
+ "en": ["epistemological", "quintessential", "perspicacious", "ubiquitous", "vicissitude", "surreptitious", "obfuscate", "indefatigable", "recalcitrant", "acquiesce"],
46
+ "de": ["erkenntnistheoretisch", "wesentlich", "scharfsinnig", "allgegenwärtig", "wandel", "heimlich", "verschleiern", "unermüdlich", "widerspenstig", "einwilligen"],
47
+ "es": ["epistemológico", "quintaesencial", "perspicaz", "ubicuo", "vicisitud", "subrepticio", "ofuscar", "infatigable", "recalcitrante", "consentir"]
48
+ }
49
+ }
50
+ }
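Note on the schema: in the CEFR file each level holds per-language word arrays keyed by language code, so a lookup walks levels, then the language, then the word list. A minimal sketch in Python of reading the file (the relative path and the query word are assumptions for illustration, not part of the commit):

import json

with open("data/cefr/cefr_words.json", encoding="utf-8") as f:
    cefr = json.load(f)

# Find the CEFR score (1-6) for a German word, if the sample lists contain it
word = "umwelt"  # hypothetical query word
score = next(
    (level["score"] for level in cefr["levels"].values() if word in level.get("de", [])),
    None,
)
print(score)  # 3, since "umwelt" sits in the B1 sample list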
data/cefr && cp CUsers13197OneDriveDesktopproject2languagedatahskhsk_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatahsk ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "CEFR (Common European Framework of Reference)",
3
+ "description": "Proficiency levels for European languages",
4
+ "languages": ["en", "de", "es", "fr", "it"],
5
+ "source": "Sample data - Replace with complete CEFR database for production",
6
+ "levels": {
7
+ "A1": {
8
+ "description": "Beginner",
9
+ "score": 1,
10
+ "en": ["hello", "goodbye", "yes", "no", "please", "thank you", "water", "food", "house", "cat", "dog", "book", "table", "chair", "good", "bad", "big", "small", "one", "two"],
11
+ "de": ["hallo", "tschüss", "ja", "nein", "bitte", "danke", "wasser", "essen", "haus", "katze", "hund", "buch", "tisch", "stuhl", "gut", "schlecht", "groß", "klein", "eins", "zwei"],
12
+ "es": ["hola", "adiós", "sí", "no", "por favor", "gracias", "agua", "comida", "casa", "gato", "perro", "libro", "mesa", "silla", "bueno", "malo", "grande", "pequeño", "uno", "dos"]
13
+ },
14
+ "A2": {
15
+ "description": "Elementary",
16
+ "score": 2,
17
+ "en": ["restaurant", "morning", "afternoon", "week", "month", "family", "friend", "work", "school", "city", "country", "weather", "summer", "winter", "happy", "sad", "easy", "difficult", "beautiful", "expensive"],
18
+ "de": ["restaurant", "morgen", "nachmittag", "woche", "monat", "familie", "freund", "arbeit", "schule", "stadt", "land", "wetter", "sommer", "winter", "glücklich", "traurig", "einfach", "schwierig", "schön", "teuer"],
19
+ "es": ["restaurante", "mañana", "tarde", "semana", "mes", "familia", "amigo", "trabajo", "escuela", "ciudad", "país", "tiempo", "verano", "invierno", "feliz", "triste", "fácil", "difícil", "hermoso", "caro"]
20
+ },
21
+ "B1": {
22
+ "description": "Intermediate",
23
+ "score": 3,
24
+ "en": ["experience", "environment", "situation", "knowledge", "relationship", "government", "education", "opportunity", "responsibility", "technology", "culture", "society", "development", "economy", "necessary", "available", "significant", "traditional", "generally", "particularly"],
25
+ "de": ["erfahrung", "umwelt", "situation", "wissen", "beziehung", "regierung", "bildung", "gelegenheit", "verantwortung", "technologie", "kultur", "gesellschaft", "entwicklung", "wirtschaft", "notwendig", "verfügbar", "bedeutend", "traditionell", "allgemein", "besonders"],
26
+ "es": ["experiencia", "medio ambiente", "situación", "conocimiento", "relación", "gobierno", "educación", "oportunidad", "responsabilidad", "tecnología", "cultura", "sociedad", "desarrollo", "economía", "necesario", "disponible", "significativo", "tradicional", "generalmente", "particularmente"]
27
+ },
28
+ "B2": {
29
+ "description": "Upper Intermediate",
30
+ "score": 4,
31
+ "en": ["implementation", "infrastructure", "phenomenon", "component", "perspective", "theoretical", "comprehensive", "substantial", "predominantly", "furthermore", "nevertheless", "consequently", "regarding", "whereas", "thereby"],
32
+ "de": ["umsetzung", "infrastruktur", "phänomen", "komponente", "perspektive", "theoretisch", "umfassend", "wesentlich", "vorwiegend", "außerdem", "dennoch", "folglich", "bezüglich", "wohingegen", "dadurch"],
33
+ "es": ["implementación", "infraestructura", "fenómeno", "componente", "perspectiva", "teórico", "comprensivo", "sustancial", "predominantemente", "además", "sin embargo", "consecuentemente", "respecto a", "mientras que", "por lo tanto"]
34
+ },
35
+ "C1": {
36
+ "description": "Advanced",
37
+ "score": 5,
38
+ "en": ["aforementioned", "notwithstanding", "juxtapose", "paradigm", "methodology", "hypothesis", "empirical", "ambiguous", "intrinsic", "exacerbate", "unprecedented", "inadvertent", "inherent", "albeit", "albeit"],
39
+ "de": ["oben erwähnt", "ungeachtet", "gegenüberstellen", "paradigma", "methodologie", "hypothese", "empirisch", "mehrdeutig", "intrinsisch", "verschärfen", "beispiellos", "unbeabsichtigt", "inhärent", "obwohl", "obgleich"],
40
+ "es": ["mencionado anteriormente", "no obstante", "yuxtaponer", "paradigma", "metodología", "hipótesis", "empírico", "ambiguo", "intrínseco", "exacerbar", "sin precedentes", "inadvertido", "inherente", "aunque", "si bien"]
41
+ },
42
+ "C2": {
43
+ "description": "Proficient",
44
+ "score": 6,
45
+ "en": ["epistemological", "quintessential", "perspicacious", "ubiquitous", "vicissitude", "surreptitious", "obfuscate", "indefatigable", "recalcitrant", "acquiesce"],
46
+ "de": ["erkenntnistheoretisch", "wesentlich", "scharfsinnig", "allgegenwärtig", "wandel", "heimlich", "verschleiern", "unermüdlich", "widerspenstig", "einwilligen"],
47
+ "es": ["epistemológico", "quintaesencial", "perspicaz", "ubicuo", "vicisitud", "subrepticio", "ofuscar", "infatigable", "recalcitrante", "consentir"]
48
+ }
49
+ }
50
+ }
data/hsk/hsk_words.json ADDED
@@ -0,0 +1,75 @@
1
+ {
2
+ "name": "HSK (Hanyu Shuiping Kaoshi)",
3
+ "description": "Chinese Proficiency Test",
4
+ "language": "zh-cn",
5
+ "source": "Sample data - Replace with complete HSK database (~5000 words) for production",
6
+ "levels": {
7
+ "1": {
8
+ "description": "Beginner",
9
+ "score": 1,
10
+ "words": [
11
+ "你", "我", "他", "她", "们", "的", "是", "不", "了", "在",
12
+ "有", "人", "这", "中", "大", "来", "上", "国", "个", "到",
13
+ "说", "时", "要", "就", "出", "会", "可", "也", "你们", "我们",
14
+ "他们", "什么", "没有", "好", "看", "爱", "去", "想", "做", "吃",
15
+ "喝", "饭", "茶", "水", "书", "字", "学", "生", "先生", "小姐"
16
+ ]
17
+ },
18
+ "2": {
19
+ "description": "Elementary",
20
+ "score": 2,
21
+ "words": [
22
+ "能", "过", "现在", "没关系", "太", "非常", "怎么", "怎么样", "知道",
23
+ "道", "学习", "认识", "高兴", "欢迎", "谢谢", "对不起", "再见", "明天", "昨天",
24
+ "今天", "年", "月", "日", "星期", "点", "分", "小时", "刚才",
25
+ "已经", "马上", "打电话", "跑步", "睡觉", "起床", "上班", "下班", "飞机", "火车",
26
+ "公共汽车", "出租车", "医院", "银行", "邮局", "超市", "商店", "饭馆", "学校", "公司"
27
+ ]
28
+ },
29
+ "3": {
30
+ "description": "Intermediate",
31
+ "score": 3,
32
+ "words": [
33
+ "但是", "因为", "所以", "虽然", "如果", "或者", "而且", "然后", "才", "刚",
34
+ "曾经", "从来", "一直", "正在", "着", "过", "了", "地", "得",
35
+ "把", "被", "让", "叫", "使", "教", "告诉", "问", "回答", "说话",
36
+ "聊天", "讨论", "解释", "介绍", "表示", "表达", "意思", "意见", "建议", "办法",
37
+ "方法", "态度", "情况", "问题", "困难", "容易", "简单", "复杂", "重要", "必要"
38
+ ]
39
+ },
40
+ "4": {
41
+ "description": "Upper Intermediate",
42
+ "score": 4,
43
+ "words": [
44
+ "麻婆豆腐", "番茄炒蛋", "宫保鸡丁", "鱼香肉丝", "醋溜白菜", "蛋炒饭", "辣子鸡", "酸辣土豆丝",
45
+ "饺子", "包子", "馒头", "面条", "米饭", "粥", "汤", "菜",
46
+ "总是", "经常", "有时候", "偶尔", "从不", "永远", "始终", "一直",
47
+ "特别", "比较", "更", "最", "极其",
48
+ "关于", "对于", "至于", "由于", "根据", "按照", "依照", "为了",
49
+ "除了", "除非", "即使", "尽管", "不管"
50
+ ]
51
+ },
52
+ "5": {
53
+ "description": "Advanced",
54
+ "score": 5,
55
+ "words": [
56
+ "龟兔赛跑", "画蛇添足", "守株待兔", "刻舟求剑", "亡羊补牢", "掩耳盗铃", "狐假虎威", "井底之蛙",
57
+ "完全", "彻底", "绝对", "肯定", "否定", "确定", "一定", "必然",
58
+ "偶然", "突然", "忽然", "顿时", "立刻", "随即", "随后",
59
+ "继续", "持续", "连续", "陆续", "依次", "逐渐", "渐渐", "逐步",
60
+ "促进", "推动", "推进", "加强", "增强", "提高", "改善", "完善"
61
+ ]
62
+ },
63
+ "6": {
64
+ "description": "Proficient",
65
+ "score": 6,
66
+ "words": [
67
+ "跑得很远了", "宝宝睡前故事", "嘲笑", "比赛", "撒开", "腿", "一会儿",
68
+ "深刻", "深入", "深远", "深厚", "深切", "深度", "深层", "深化",
69
+ "广泛", "广大", "广阔", "宽广", "宽阔", "辽阔", "浩瀚", "无限",
70
+ "精确", "准确", "正确", "确切", "切实", "实际", "实在", "实质",
71
+ "综合", "综述", "概括", "概述", "总结", "归纳", "归结", "归类"
72
+ ]
73
+ }
74
+ }
75
+ }
data/jlpt/jlpt_words.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "name": "JLPT (Japanese Language Proficiency Test)",
3
+ "description": "Proficiency levels for Japanese language",
4
+ "language": "ja",
5
+ "source": "Sample data - Replace with complete JLPT database for production",
6
+ "note": "N5 is easiest, N1 is hardest",
7
+ "levels": {
8
+ "N5": {
9
+ "description": "Beginner",
10
+ "score": 1,
11
+ "words": [
12
+ "こんにちは", "ありがとう", "すみません", "はい", "いいえ",
13
+ "私", "あなた", "これ", "それ", "あれ",
14
+ "水", "食べ物", "家", "学校", "本",
15
+ "大きい", "小さい", "良い", "悪い", "新しい",
16
+ "一", "二", "三", "四", "五"
17
+ ]
18
+ },
19
+ "N4": {
20
+ "description": "Elementary",
21
+ "score": 2,
22
+ "words": [
23
+ "レストラン", "駅", "病院", "図書館", "公園",
24
+ "朝", "昼", "夜", "今日", "明日",
25
+ "友達", "先生", "学生", "会社", "仕事",
26
+ "便利", "簡単", "難しい", "面白い", "つまらない",
27
+ "食べる", "飲む", "行く", "来る", "見る"
28
+ ]
29
+ },
30
+ "N3": {
31
+ "description": "Intermediate",
32
+ "score": 3,
33
+ "words": [
34
+ "経験", "環境", "状況", "知識", "関係",
35
+ "政府", "教育", "機会", "責任", "技術",
36
+ "文化", "社会", "発展", "経済", "必要",
37
+ "一般的", "特に", "確かに", "実際", "最近",
38
+ "考える", "思う", "感じる", "理解する", "説明する"
39
+ ]
40
+ },
41
+ "N2": {
42
+ "description": "Upper Intermediate",
43
+ "score": 4,
44
+ "words": [
45
+ "実施", "基盤", "現象", "要素", "観点",
46
+ "理論的", "包括的", "実質的", "主に", "さらに",
47
+ "しかしながら", "従って", "に関して", "一方", "それによって",
48
+ "検討する", "分析する", "評価する", "提案する", "実現する"
49
+ ]
50
+ },
51
+ "N1": {
52
+ "description": "Advanced",
53
+ "score": 5,
54
+ "words": [
55
+ "前述", "にもかかわらず", "対比する", "範例", "方法論",
56
+ "仮説", "経験的", "曖昧", "本質的", "悪化させる",
57
+ "前例のない", "無意識", "固有", "とはいえ", "けれども",
58
+ "体系化する", "統合する", "最適化する", "具現化する", "顕在化する"
59
+ ]
60
+ }
61
+ }
62
+ }
data/jlpt && cp CUsers13197OneDriveDesktopproject2languagedatatopiktopik_words.json CUsers13197OneDriveDesktop24697Project2agentic-language-partnerdatatopik ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "JLPT (Japanese Language Proficiency Test)",
3
+ "description": "Proficiency levels for Japanese language",
4
+ "language": "ja",
5
+ "source": "Sample data - Replace with complete JLPT database for production",
6
+ "note": "N5 is easiest, N1 is hardest",
7
+ "levels": {
8
+ "N5": {
9
+ "description": "Beginner",
10
+ "score": 1,
11
+ "words": [
12
+ "こんにちは", "ありがとう", "すみません", "はい", "いいえ",
13
+ "私", "あなた", "これ", "それ", "あれ",
14
+ "水", "食べ物", "家", "学校", "本",
15
+ "大きい", "小さい", "良い", "悪い", "新しい",
16
+ "一", "二", "三", "四", "五"
17
+ ]
18
+ },
19
+ "N4": {
20
+ "description": "Elementary",
21
+ "score": 2,
22
+ "words": [
23
+ "レストラン", "駅", "病院", "図書館", "公園",
24
+ "朝", "昼", "夜", "今日", "明日",
25
+ "友達", "先生", "学生", "会社", "仕事",
26
+ "便利", "簡単", "難しい", "面白い", "つまらない",
27
+ "食べる", "飲む", "行く", "来る", "見る"
28
+ ]
29
+ },
30
+ "N3": {
31
+ "description": "Intermediate",
32
+ "score": 3,
33
+ "words": [
34
+ "経験", "環境", "状況", "知識", "関係",
35
+ "政府", "教育", "機会", "責任", "技術",
36
+ "文化", "社会", "発展", "経済", "必要",
37
+ "一般的", "特に", "確かに", "実際", "最近",
38
+ "考える", "思う", "感じる", "理解する", "説明する"
39
+ ]
40
+ },
41
+ "N2": {
42
+ "description": "Upper Intermediate",
43
+ "score": 4,
44
+ "words": [
45
+ "実施", "基盤", "現象", "要素", "観点",
46
+ "理論的", "包括的", "実質的", "主に", "さらに",
47
+ "しかしながら", "従って", "に関して", "一方", "それによって",
48
+ "検討する", "分析する", "評価する", "提案する", "実現する"
49
+ ]
50
+ },
51
+ "N1": {
52
+ "description": "Advanced",
53
+ "score": 5,
54
+ "words": [
55
+ "前述", "にもかかわらず", "対比する", "範例", "方法論",
56
+ "仮説", "経験的", "曖昧", "本質的", "悪化させる",
57
+ "前例のない", "無意識", "固有", "とはいえ", "けれども",
58
+ "体系化する", "統合する", "最適化する", "具現化する", "顕在化する"
59
+ ]
60
+ }
61
+ }
62
+ }
data/topik/topik_words.json ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "name": "TOPIK (Test of Proficiency in Korean)",
3
+ "description": "Proficiency levels for Korean language",
4
+ "language": "ko",
5
+ "source": "Sample data - Replace with complete TOPIK database for production",
6
+ "note": "TOPIK I (1-2) and TOPIK II (3-6)",
7
+ "levels": {
8
+ "1": {
9
+ "description": "Beginner",
10
+ "score": 1,
11
+ "words": [
12
+ "안녕하세요", "감사합니다", "죄송합니다", "네", "아니요",
13
+ "나", "너", "이것", "저것", "그것",
14
+ "물", "음식", "집", "학교", "책",
15
+ "크다", "작다", "좋다", "나쁘다", "새롭다",
16
+ "하나", "둘", "셋", "넷", "다섯"
17
+ ]
18
+ },
19
+ "2": {
20
+ "description": "Elementary",
21
+ "score": 2,
22
+ "words": [
23
+ "식당", "역", "병원", "도서관", "공원",
24
+ "아침", "점심", "저녁", "오늘", "내일",
25
+ "친구", "선생님", "학생", "회사", "일",
26
+ "편리하다", "쉽다", "어렵다", "재미있다", "지루하다",
27
+ "먹다", "마시다", "가다", "오다", "보다"
28
+ ]
29
+ },
30
+ "3": {
31
+ "description": "Intermediate",
32
+ "score": 3,
33
+ "words": [
34
+ "경험", "환경", "상황", "지식", "관계",
35
+ "정부", "교육", "기회", "책임", "기술",
36
+ "문화", "사회", "발전", "경제", "필요",
37
+ "일반적", "특히", "확실히", "실제로", "최근",
38
+ "생각하다", "느끼다", "이해하다", "설명하다", "표현하다"
39
+ ]
40
+ },
41
+ "4": {
42
+ "description": "Upper Intermediate",
43
+ "score": 4,
44
+ "words": [
45
+ "실시", "기반", "현상", "요소", "관점",
46
+ "이론적", "포괄적", "실질적", "주로", "더욱이",
47
+ "그러나", "따라서", "에 관하여", "한편", "그로써",
48
+ "검토하다", "분석하다", "평가하다", "제안하다", "실현하다"
49
+ ]
50
+ },
51
+ "5": {
52
+ "description": "Advanced",
53
+ "score": 5,
54
+ "words": [
55
+ "전술한", "에도 불구하고", "대비하다", "패러다임", "방법론",
56
+ "가설", "경험적", "애매한", "본질적", "악화시키다",
57
+ "전례없는", "무의식적", "고유한", "비록", "그럼에도",
58
+ "체계화하다", "통합하다", "최적화하다", "구현하다", "현현하다"
59
+ ]
60
+ },
61
+ "6": {
62
+ "description": "Proficient",
63
+ "score": 6,
64
+ "words": [
65
+ "인식론적", "전형적인", "예리한", "편재하는", "변천",
66
+ "은밀한", "난독화하다", "불굴의", "완고한", "동의하다"
67
+ ]
68
+ }
69
+ }
70
+ }
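The HSK, JLPT, and TOPIK files use a simpler, single-language schema: each level carries one "words" array plus a numeric "score". A minimal sketch of flattening such a file into a word-to-score map (the path is assumed relative to the repo root); DifficultyScorer._create_word_lookups below does essentially this for all four databases:

import json

with open("data/hsk/hsk_words.json", encoding="utf-8") as f:
    hsk = json.load(f)

# Flatten all levels into one lookup table: word -> HSK score (1-6)
lookup = {
    word: level["score"]
    for level in hsk["levels"].values()
    for word in level["words"]
}
print(lookup.get("学习"))  # 2 in this sample data (HSK level 2)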
requirements.txt CHANGED
@@ -17,11 +17,13 @@ gTTS
17
  ########################################
18
  pytesseract
19
  pillow
 
20
 
21
  ########################################
22
- # Translation
23
  ########################################
24
  deep-translator
 
25
 
26
  ########################################
27
  # Language Modeling / Text Processing
@@ -32,6 +34,11 @@ sentencepiece
32
  safetensors
33
  regex
34
 
 
 
 
 
 
35
  ########################################
36
  # General Utilities
37
  ########################################
 
17
  ########################################
18
  pytesseract
19
  pillow
20
+ opencv-python-headless
21
 
22
  ########################################
23
+ # Translation & Language Detection
24
  ########################################
25
  deep-translator
26
+ langdetect
27
 
28
  ########################################
29
  # Language Modeling / Text Processing
 
34
  safetensors
35
  regex
36
 
37
+ ########################################
38
+ # AI APIs (Optional - for Quiz Generation)
39
+ ########################################
40
+ openai
41
+
42
  ########################################
43
  # General Utilities
44
  ########################################
src/app/difficulty_scorer.py ADDED
@@ -0,0 +1,290 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Difficulty Scorer - Multi-language Support
4
+
5
+ Supports 6 languages with proficiency test databases:
6
+ - English (en): CEFR A1-C2
7
+ - Chinese (zh-cn): HSK 1-6
8
+ - German (de): CEFR A1-C2
9
+ - Spanish (es): CEFR A1-C2
10
+ - Japanese (ja): JLPT N5-N1
11
+ - Korean (ko): TOPIK 1-6
12
+ """
13
+
14
+ import json
15
+ from typing import Dict, Any, List, Optional
16
+ from pathlib import Path
17
+
18
+
19
+ class DifficultyScorer:
20
+ """Multi-language difficulty scoring system"""
21
+
22
+ LANGUAGE_TESTS = {
23
+ 'en': 'cefr',
24
+ 'de': 'cefr',
25
+ 'es': 'cefr',
26
+ 'fr': 'cefr',
27
+ 'it': 'cefr',
28
+ 'zh-cn': 'hsk',
29
+ 'zh-tw': 'hsk',
30
+ 'ja': 'jlpt',
31
+ 'ko': 'topik',
32
+ 'ru': 'cefr',
33
+ }
34
+
35
+ JLPT_MAPPING = {
36
+ 'N5': 1, 'N4': 2, 'N3': 3, 'N2': 4, 'N1': 5
37
+ }
38
+
39
+ def __init__(self, data_dir: str = None):
40
+ """
41
+ Initialize multi-language difficulty scorer
42
+
43
+ Args:
44
+ data_dir: Path to data directory containing proficiency databases
45
+ """
46
+ if data_dir is None:
47
+ current_dir = Path(__file__).parent
48
+ project_root = current_dir.parent.parent
49
+ data_dir = project_root / "data"
50
+
51
+ self.data_dir = Path(data_dir)
52
+ self.databases = self._load_all_databases()
53
+ self.word_lookups = self._create_word_lookups()
54
+
55
+ def _load_all_databases(self) -> Dict[str, Dict]:
56
+ """Load all language proficiency databases"""
57
+ databases = {}
58
+
59
+ # Load CEFR (English, German, Spanish, etc.)
60
+ cefr_path = self.data_dir / "cefr" / "cefr_words.json"
61
+ if cefr_path.exists():
62
+ try:
63
+ with open(cefr_path, 'r', encoding='utf-8') as f:
64
+ databases['cefr'] = json.load(f)
65
+ except Exception as e:
66
+ print(f"[DifficultyScorer] Failed to load CEFR: {e}")
67
+
68
+ # Load HSK (Chinese)
69
+ hsk_path = self.data_dir / "hsk" / "hsk_words.json"
70
+ if hsk_path.exists():
71
+ try:
72
+ with open(hsk_path, 'r', encoding='utf-8') as f:
73
+ databases['hsk'] = json.load(f)
74
+ except Exception as e:
75
+ print(f"[DifficultyScorer] Failed to load HSK: {e}")
76
+
77
+ # Load JLPT (Japanese)
78
+ jlpt_path = self.data_dir / "jlpt" / "jlpt_words.json"
79
+ if jlpt_path.exists():
80
+ try:
81
+ with open(jlpt_path, 'r', encoding='utf-8') as f:
82
+ databases['jlpt'] = json.load(f)
83
+ except Exception as e:
84
+ print(f"[DifficultyScorer] Failed to load JLPT: {e}")
85
+
86
+ # Load TOPIK (Korean)
87
+ topik_path = self.data_dir / "topik" / "topik_words.json"
88
+ if topik_path.exists():
89
+ try:
90
+ with open(topik_path, 'r', encoding='utf-8') as f:
91
+ databases['topik'] = json.load(f)
92
+ except Exception as e:
93
+ print(f"[DifficultyScorer] Failed to load TOPIK: {e}")
94
+
95
+ return databases
96
+
97
+ def _create_word_lookups(self) -> Dict[str, Dict[str, int]]:
98
+ """Create word-to-score lookup tables for all languages"""
99
+ lookups = {}
100
+
101
+ # CEFR lookups
102
+ if 'cefr' in self.databases:
103
+ cefr = self.databases['cefr']
104
+ for lang_code in ['en', 'de', 'es', 'fr', 'it', 'ru']:
105
+ lookups[lang_code] = {}
106
+ if 'levels' in cefr:
107
+ for level, data in cefr['levels'].items():
108
+ score = data.get('score', 3)
109
+ if lang_code in data:
110
+ for word in data[lang_code]:
111
+ lookups[lang_code][word.lower()] = score
112
+
113
+ # HSK lookup (Chinese)
114
+ if 'hsk' in self.databases:
115
+ lookups['zh-cn'] = {}
116
+ lookups['zh-tw'] = {}
117
+ if 'levels' in self.databases['hsk']:
118
+ for level, data in self.databases['hsk']['levels'].items():
119
+ score = data.get('score', 3)
120
+ for word in data.get('words', []):
121
+ lookups['zh-cn'][word] = score
122
+ lookups['zh-tw'][word] = score
123
+
124
+ # JLPT lookup (Japanese)
125
+ if 'jlpt' in self.databases:
126
+ lookups['ja'] = {}
127
+ if 'levels' in self.databases['jlpt']:
128
+ for level, data in self.databases['jlpt']['levels'].items():
129
+ score = data.get('score', 3)
130
+ for word in data.get('words', []):
131
+ lookups['ja'][word] = score
132
+
133
+ # TOPIK lookup (Korean)
134
+ if 'topik' in self.databases:
135
+ lookups['ko'] = {}
136
+ if 'levels' in self.databases['topik']:
137
+ for level, data in self.databases['topik']['levels'].items():
138
+ score = data.get('score', 3)
139
+ for word in data.get('words', []):
140
+ lookups['ko'][word] = score
141
+
142
+ return lookups
143
+
144
+ def get_proficiency_score(self, word: str, language: str) -> float:
145
+ """
146
+ Get proficiency test score for a word
147
+
148
+ Args:
149
+ word: Word or phrase
150
+ language: Language code
151
+
152
+ Returns:
153
+ Score 1-6 (1=easiest, 6=hardest)
154
+ """
155
+ language = language.lower()
156
+
157
+ if language not in self.word_lookups:
158
+ return self._estimate_by_length(word)
159
+
160
+ lookup = self.word_lookups[language]
161
+ search_word = word if language in ['zh-cn', 'zh-tw', 'ja', 'ko'] else word.lower()
162
+
163
+ if search_word in lookup:
164
+ return float(lookup[search_word])
165
+
166
+ return self._estimate_by_length(word)
167
+
168
+ def _estimate_by_length(self, word: str) -> float:
169
+ """Estimate difficulty by word length (fallback)"""
170
+ length = len(word)
171
+ if length <= 3:
172
+ return 2.0
173
+ elif length <= 6:
174
+ return 3.5
175
+ elif length <= 10:
176
+ return 4.5
177
+ else:
178
+ return 5.5
179
+
180
+ def get_length_score(self, word: str) -> float:
181
+ """Score based on word length"""
182
+ length = len(word)
183
+ if length == 1:
184
+ return 1.0
185
+ elif length <= 3:
186
+ return 2.0
187
+ elif length <= 6:
188
+ return 3.0
189
+ elif length <= 10:
190
+ return 4.0
191
+ elif length <= 15:
192
+ return 5.0
193
+ else:
194
+ return 6.0
195
+
196
+ def calculate_difficulty(self, word: str, language: str) -> Dict[str, Any]:
197
+ """
198
+ Calculate comprehensive difficulty score
199
+
200
+ Weights:
201
+ - Proficiency level: 60%
202
+ - Word length: 40%
203
+ """
204
+ proficiency_score = self.get_proficiency_score(word, language)
205
+ length_score = self.get_length_score(word)
206
+
207
+ overall_score = proficiency_score * 0.6 + length_score * 0.4
208
+
209
+ if overall_score <= 2.5:
210
+ level = "beginner"
211
+ elif overall_score <= 4.5:
212
+ level = "intermediate"
213
+ else:
214
+ level = "advanced"
215
+
216
+ test_name = self.LANGUAGE_TESTS.get(language.lower(), 'unknown')
217
+
218
+ return {
219
+ "overall_score": round(overall_score, 2),
220
+ "level": level,
221
+ "factors": {
222
+ "proficiency_score": round(proficiency_score, 2),
223
+ "length": len(word),
224
+ "length_score": round(length_score, 2),
225
+ "test_system": test_name.upper()
226
+ }
227
+ }
228
+
229
+ def score_flashcard(self, card: Dict[str, Any]) -> Dict[str, Any]:
230
+ """Add difficulty score to flashcard"""
231
+ word = card.get('front', '')
232
+ language = card.get('language', 'en')
233
+
234
+ difficulty = self.calculate_difficulty(word, language)
235
+
236
+ card_with_difficulty = card.copy()
237
+ card_with_difficulty['difficulty'] = difficulty
238
+
239
+ return card_with_difficulty
240
+
241
+ def score_all_flashcards(self, flashcards: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
242
+ """Score all flashcards"""
243
+ return [self.score_flashcard(card) for card in flashcards]
244
+
245
+ def get_statistics(self, flashcards: List[Dict[str, Any]]) -> Dict[str, Any]:
246
+ """Generate difficulty statistics"""
247
+ if not flashcards:
248
+ return {}
249
+
250
+ level_counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
251
+ scores = []
252
+ by_language = {}
253
+
254
+ for card in flashcards:
255
+ if 'difficulty' in card:
256
+ level = card['difficulty']['level']
257
+ level_counts[level] += 1
258
+ scores.append(card['difficulty']['overall_score'])
259
+
260
+ lang = card.get('language', 'unknown')
261
+ if lang not in by_language:
262
+ by_language[lang] = {"count": 0, "scores": []}
263
+ by_language[lang]["count"] += 1
264
+ by_language[lang]["scores"].append(card['difficulty']['overall_score'])
265
+
266
+ for lang in by_language:
267
+ lang_scores = by_language[lang]["scores"]
268
+ by_language[lang]["avg_score"] = round(sum(lang_scores) / len(lang_scores), 2)
269
+ del by_language[lang]["scores"]
270
+
271
+ return {
272
+ "total_cards": len(flashcards),
273
+ "by_level": level_counts,
274
+ "by_language": by_language,
275
+ "average_score": round(sum(scores) / len(scores), 2) if scores else 0,
276
+ "min_score": round(min(scores), 2) if scores else 0,
277
+ "max_score": round(max(scores), 2) if scores else 0
278
+ }
279
+
280
+
281
+ # Global instance (lazy initialization)
282
+ _difficulty_scorer = None
283
+
284
+
285
+ def get_difficulty_scorer() -> DifficultyScorer:
286
+ """Get or create the global DifficultyScorer instance"""
287
+ global _difficulty_scorer
288
+ if _difficulty_scorer is None:
289
+ _difficulty_scorer = DifficultyScorer()
290
+ return _difficulty_scorer
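For reference, a short usage sketch of the scorer defined above (the import path assumes this repo's package layout; the word is chosen purely for illustration):

from src.app.difficulty_scorer import get_difficulty_scorer

scorer = get_difficulty_scorer()  # lazily loads data/cefr, data/hsk, data/jlpt, data/topik
result = scorer.calculate_difficulty("environment", "en")
# result looks like:
# {"overall_score": ..., "level": "beginner" | "intermediate" | "advanced",
#  "factors": {"proficiency_score": ..., "length": 11, "length_score": ..., "test_system": "CEFR"}}
print(result["level"], result["overall_score"])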
src/app/flashcard_generator.py ADDED
@@ -0,0 +1,288 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Flashcard Generator - Extracts vocabulary with context from OCR results
4
+ Supports multi-language extraction and context sentence generation
5
+ """
6
+
7
+ import json
8
+ import re
9
+ from pathlib import Path
10
+ from typing import List, Dict, Any, Optional
11
+
12
+ from deep_translator import GoogleTranslator
13
+
14
+
15
+ class FlashcardGenerator:
16
+ """Generate flashcards from OCR results with multi-language support"""
17
+
18
+ def __init__(self):
19
+ self.supported_languages = {
20
+ 'zh-cn': 'Chinese (Simplified)',
21
+ 'zh-tw': 'Chinese (Traditional)',
22
+ 'ja': 'Japanese',
23
+ 'ko': 'Korean',
24
+ 'en': 'English',
25
+ 'fr': 'French',
26
+ 'de': 'German',
27
+ 'es': 'Spanish',
28
+ 'ru': 'Russian',
29
+ }
30
+
31
+ self.lang_map = {
32
+ 'zh-cn': 'zh-CN',
33
+ 'zh-tw': 'zh-TW',
34
+ 'ja': 'ja',
35
+ 'ko': 'ko',
36
+ 'ru': 'ru',
37
+ }
38
+
39
+ self.translator_cache = {}
40
+
41
+ # Stop words for filtering common words
42
+ self.stop_words = {
43
+ 'zh-cn': {
44
+ '的', '了', '是', '在', '我', '有', '和', '就', '不', '人',
45
+ '都', '一个', '上', '也', '很', '到', '说', '要', '去', '你',
46
+ '会', '着', '没有', '看', '好', '自己', '这', '他', '她', '它',
47
+ '们', '个', '吗', '呢', '吧', '啊', '哦', '嗯', '呀'
48
+ },
49
+ 'en': {
50
+ 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
51
+ 'for', 'of', 'with', 'by', 'from', 'is', 'am', 'are', 'was', 'were',
52
+ 'be', 'been', 'being', 'this', 'that', 'these', 'those', 'i', 'you',
53
+ 'he', 'she', 'it', 'we', 'they', 'my', 'your', 'his', 'her', 'its'
54
+ },
55
+ 'de': {
56
+ 'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einer',
57
+ 'und', 'oder', 'aber', 'in', 'an', 'auf', 'für', 'mit', 'von',
58
+ 'zu', 'ist', 'sind', 'war', 'waren', 'ich', 'du', 'er', 'sie', 'es'
59
+ },
60
+ 'es': {
61
+ 'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'y', 'o',
62
+ 'pero', 'en', 'a', 'de', 'con', 'por', 'para', 'es', 'son', 'era',
63
+ 'yo', 'tú', 'él', 'ella', 'nosotros', 'vosotros', 'ellos', 'ellas'
64
+ },
65
+ 'ja': {
66
+ 'の', 'に', 'は', 'を', 'た', 'が', 'で', 'て', 'と', 'し',
67
+ 'れ', 'さ', 'ある', 'いる', 'も', 'する', 'から', 'な', 'こ', 'そ'
68
+ },
69
+ 'ko': {
70
+ '은', '는', '이', '가', '을', '를', '의', '에', '에서', '로',
71
+ '와', '과', '도', '만', '까지', '부터', '하다', '되다', '있다', '없다'
72
+ },
73
+ 'ru': {
74
+ 'и', 'в', 'на', 'с', 'к', 'по', 'за', 'из', 'у', 'о',
75
+ 'а', 'но', 'что', 'это', 'как', 'он', 'она', 'они', 'мы', 'вы'
76
+ }
77
+ }
78
+
79
+ def extract_chinese_text(self, text: str) -> List[str]:
80
+ """Extract Chinese characters/phrases"""
81
+ pattern = re.compile(r'[\u4e00-\u9fff]+')
82
+ return pattern.findall(text)
83
+
84
+ def extract_japanese_text(self, text: str) -> List[str]:
85
+ """Extract Japanese text (kanji + hiragana + katakana)"""
86
+ pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]+')
87
+ return pattern.findall(text)
88
+
89
+ def extract_korean_text(self, text: str) -> List[str]:
90
+ """Extract Korean words"""
91
+ pattern = re.compile(r'[\uAC00-\uD7AF]+')
92
+ return pattern.findall(text)
93
+
94
+ def extract_european_words(self, text: str) -> List[str]:
95
+ """Extract words from European languages"""
96
+ pattern = re.compile(r'[a-zA-ZäöüßÄÖÜáéíóúñÁÉÍÓÚÑàèìòùÀÈÌÒÙ\u0400-\u04FF]+')
97
+ return pattern.findall(text)
98
+
99
+ def filter_by_length(self, items: List[str], min_len: int = 2, max_len: int = 15) -> List[str]:
100
+ """Filter items by character length"""
101
+ return [item for item in items if min_len <= len(item) <= max_len]
102
+
103
+ def filter_stop_words(self, items: List[str], language: str) -> List[str]:
104
+ """Remove common stop words"""
105
+ stop_words = self.stop_words.get(language, set())
106
+ if language in ['en', 'de', 'es', 'ru']:
107
+ return [item for item in items if item.lower() not in stop_words]
108
+ return [item for item in items if item not in stop_words]
109
+
110
+ def extract_vocabulary_by_language(self, text: str, language: str) -> List[str]:
111
+ """Extract vocabulary based on language type"""
112
+ if language in ['zh-cn', 'zh-tw']:
113
+ return self.extract_chinese_text(text)
114
+ elif language == 'ja':
115
+ return self.extract_japanese_text(text)
116
+ elif language == 'ko':
117
+ return self.extract_korean_text(text)
118
+ else:
119
+ return self.extract_european_words(text)
120
+
121
+ def get_sentence_delimiter(self, language: str) -> str:
122
+ """Get sentence delimiter pattern for a language"""
123
+ return r'[。!?.!?\n]+'
124
+
125
+ def extract_context_sentence(self, word: str, text: str, language: str = 'zh-cn') -> str:
126
+ """Extract context around the word"""
127
+ delimiter = self.get_sentence_delimiter(language)
128
+ sentences = re.split(delimiter, text)
129
+ sentences = [s.strip() for s in sentences if s.strip()]
130
+
131
+ if not sentences:
132
+ return ""
133
+
134
+ # Find sentence containing the word
135
+ word_sentence_idx = -1
136
+ for idx, sentence in enumerate(sentences):
137
+ if word in sentence:
138
+ word_sentence_idx = idx
139
+ break
140
+
141
+ if word_sentence_idx == -1:
142
+ return ""
143
+
144
+ word_sentence = sentences[word_sentence_idx]
145
+ is_same_as_sentence = (word_sentence == word or word_sentence.replace(' ', '') == word.replace(' ', ''))
146
+ is_title = (is_same_as_sentence and (word_sentence_idx <= 3 or word_sentence_idx < len(sentences) - 1))
147
+
148
+ context_sentences = []
149
+
150
+ if is_title:
151
+ context_sentences.append(word_sentence)
152
+ for i in range(word_sentence_idx + 1, min(word_sentence_idx + 3, len(sentences))):
153
+ next_sentence = sentences[i]
154
+ if len(next_sentence) > 3:
155
+ context_sentences.append(next_sentence)
156
+ break
157
+ else:
158
+ if word_sentence_idx > 0:
159
+ prev_sentence = sentences[word_sentence_idx - 1]
160
+ if len(prev_sentence) > 5:
161
+ context_sentences.append(prev_sentence)
162
+
163
+ context_sentences.append(word_sentence)
164
+
165
+ if word_sentence_idx < len(sentences) - 1:
166
+ next_sentence = sentences[word_sentence_idx + 1]
167
+ if len(next_sentence) > 5:
168
+ context_sentences.append(next_sentence)
169
+
170
+ if language in ['zh-cn', 'zh-tw', 'ja']:
171
+ context = ''.join(context_sentences)
172
+ else:
173
+ context = ' '.join(context_sentences)
174
+
175
+ if len(context) > 150:
176
+ context = context[:150] + '...'
177
+
178
+ return context
179
+
180
+ def translate_to_target(self, text: str, source_lang: str, target_lang: str = 'en') -> str:
181
+ """Translate text to target language"""
182
+ cache_key = f"{source_lang}:{target_lang}:{text}"
183
+ if cache_key in self.translator_cache:
184
+ return self.translator_cache[cache_key]
185
+
186
+ try:
187
+ source = self.lang_map.get(source_lang, source_lang)
188
+ target = self.lang_map.get(target_lang, target_lang)
189
+
190
+ translator = GoogleTranslator(source=source, target=target)
191
+ translation = translator.translate(text)
192
+
193
+ self.translator_cache[cache_key] = translation
194
+ return translation
195
+ except Exception as e:
196
+ return f"[Translation failed: {text}]"
197
+
198
+ def extract_learnable_items(self, ocr_result: Dict[str, Any], target_lang: str = 'en') -> List[Dict[str, Any]]:
199
+ """Extract vocabulary items from OCR result"""
200
+ original_text = ocr_result.get('original_text', '') or ocr_result.get('text', '')
201
+ language = ocr_result.get('detected_language', 'unknown')
202
+ filename = ocr_result.get('filename', '')
203
+
204
+ if not original_text or language == 'unknown':
205
+ return []
206
+
207
+ language = language.lower()
208
+
209
+ # Extract vocabulary
210
+ vocabulary_items = self.extract_vocabulary_by_language(original_text, language)
211
+
212
+ if not vocabulary_items:
213
+ return []
214
+
215
+ # Determine length constraints
216
+ if language in ['zh-cn', 'zh-tw', 'ja']:
217
+ min_len, max_len = 2, 6
218
+ elif language == 'ko':
219
+ min_len, max_len = 2, 10
220
+ else:
221
+ min_len, max_len = 3, 15
222
+
223
+ filtered_items = self.filter_by_length(vocabulary_items, min_len=min_len, max_len=max_len)
224
+ filtered_items = self.filter_stop_words(filtered_items, language)
225
+
226
+ # Remove duplicates
227
+ unique_items = list(dict.fromkeys(filtered_items))[:10]
228
+
229
+ if not unique_items:
230
+ return []
231
+
232
+ items = []
233
+ for idx, item in enumerate(unique_items):
234
+ # Get translation
235
+ if language == target_lang:
236
+ translation = item
237
+ else:
238
+ translation = self.translate_to_target(item, language, target_lang)
239
+
240
+ # Skip if translation is same as original
241
+ if translation.strip().lower() == item.strip().lower():
242
+ continue
243
+
244
+ # Extract context
245
+ context = self.extract_context_sentence(item, original_text, language)
246
+ context_translated = ""
247
+ if context and language != target_lang:
248
+ context_translated = self.translate_to_target(context, language, target_lang)
249
+
250
+ items.append({
251
+ 'id': idx + 1,
252
+ 'front': item,
253
+ 'back': translation,
254
+ 'context': context,
255
+ 'context_en': context_translated,
256
+ 'language': language,
257
+ 'content_type': 'ocr_vocab',
258
+ 'source_file': filename,
259
+ })
260
+
261
+ return items
262
+
263
+ def generate_flashcards(self, ocr_results: List[Dict[str, Any]], target_lang: str = 'en') -> Dict[str, Any]:
264
+ """Generate flashcards from OCR results"""
265
+ all_cards = []
266
+
267
+ for result in ocr_results:
268
+ learnable_items = self.extract_learnable_items(result, target_lang)
269
+ all_cards.extend(learnable_items)
270
+
271
+ return {
272
+ 'total_cards': len(all_cards),
273
+ 'cards': all_cards,
274
+ 'metadata': {
275
+ 'generator': 'FlashcardGenerator v2.0',
276
+ 'method': 'context-extraction',
277
+ }
278
+ }
279
+
280
+ def save_flashcards(self, flashcards: Dict[str, Any], output_path: str):
281
+ """Save flashcards to JSON file"""
282
+ with open(output_path, 'w', encoding='utf-8') as f:
283
+ json.dump(flashcards, f, ensure_ascii=False, indent=2)
284
+
285
+ def load_ocr_results(self, input_path: str) -> List[Dict[str, Any]]:
286
+ """Load OCR results from JSON file"""
287
+ with open(input_path, 'r', encoding='utf-8') as f:
288
+ return json.load(f)
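A short usage sketch of the generator above; the input dict mirrors the keys extract_learnable_items reads, and the sample text is an assumption for illustration. Translations go through GoogleTranslator, so this needs network access:

from src.app.flashcard_generator import FlashcardGenerator

generator = FlashcardGenerator()
ocr_result = {
    "original_text": "我喜欢学习中文。明天去图书馆。",  # placeholder OCR output
    "detected_language": "zh-cn",
    "filename": "page1.png",
}
deck = generator.generate_flashcards([ocr_result], target_lang="en")
# deck has the shape {"total_cards": ..., "cards": [...], "metadata": {...}};
# each card carries front, back, context, context_en, language, content_type, source_file
for card in deck["cards"]:
    print(card["front"], "->", card["back"])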
src/app/flashcards_tools.py CHANGED
@@ -1,20 +1,33 @@
1
-
2
- # src/app/flashcards_tools.py
 
 
3
 
4
  import json
5
  import re
6
  from pathlib import Path
7
- from typing import Dict, List, Tuple, Optional
8
 
9
  from deep_translator import GoogleTranslator
10
 
11
  from .config import get_user_dir
12
13
 
14
  def _get_decks_dir(username: str) -> Path:
15
- """
16
- Returns the directory where all of a user's decks are stored.
17
- """
18
  user_dir = get_user_dir(username)
19
  decks_dir = user_dir / "decks"
20
  decks_dir.mkdir(parents=True, exist_ok=True)
@@ -22,11 +35,7 @@ def _get_decks_dir(username: str) -> Path:
22
 
23
 
24
  def list_user_decks(username: str) -> Dict[str, Path]:
25
- """
26
- Returns a mapping of deck name -> deck json path.
27
- Deck name is taken from the deck's "name" field if present,
28
- otherwise the filename stem.
29
- """
30
  decks_dir = _get_decks_dir(username)
31
  deck_files = sorted(decks_dir.glob("*.json"))
32
  decks: Dict[str, Path] = {}
@@ -38,7 +47,6 @@ def list_user_decks(username: str) -> Dict[str, Path]:
38
  except Exception:
39
  name = path.stem
40
 
41
- # ensure uniqueness by appending stem if needed
42
  if name in decks and decks[name] != path:
43
  name = f"{name} ({path.stem})"
44
  decks[name] = path
@@ -47,24 +55,31 @@ def list_user_decks(username: str) -> Dict[str, Path]:
47
 
48
 
49
  def _ensure_card_stats(card: Dict) -> None:
50
- """
51
- Ensure that a card has simple spaced-repetition stats.
52
- """
53
- if "score" not in card: # learning strength
54
  card["score"] = 0
55
  if "reviews" not in card:
56
  card["reviews"] = 0
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
59
  def load_deck(path: Path) -> Dict:
60
- """
61
- Loads a deck from JSON, ensuring 'cards' exists and that
62
- each card has basic stats for spaced repetition.
63
- """
64
  try:
65
  data = json.loads(path.read_text(encoding="utf-8"))
66
  except Exception:
67
  data = {}
 
68
  if "cards" not in data or not isinstance(data["cards"], list):
69
  data["cards"] = []
70
  if "name" not in data:
@@ -79,9 +94,7 @@ def load_deck(path: Path) -> Dict:
79
 
80
 
81
  def save_deck(path: Path, deck: Dict) -> None:
82
- """
83
- Saves deck to JSON.
84
- """
85
  if "cards" not in deck:
86
  deck["cards"] = []
87
  if "name" not in deck:
@@ -89,21 +102,14 @@ def save_deck(path: Path, deck: Dict) -> None:
89
  if "tags" not in deck or not isinstance(deck["tags"], list):
90
  deck["tags"] = []
91
 
92
- # make sure stats are present
93
  for card in deck["cards"]:
94
  _ensure_card_stats(card)
95
 
96
  path.write_text(json.dumps(deck, indent=2, ensure_ascii=False), encoding="utf-8")
97
 
98
 
99
- # ------------------------------------------------------------
100
- # Shared tokenization
101
- # ------------------------------------------------------------
102
-
103
  def _extract_candidate_words(text: str) -> List[str]:
104
- """
105
- Simple tokenizer & filter for candidate vocab words.
106
- """
107
  tokens = re.findall(r"\b\w+\b", text, flags=re.UNICODE)
108
  out = []
109
  seen = set()
@@ -121,28 +127,63 @@ def _extract_candidate_words(text: str) -> List[str]:
121
  return out
122
 
123
 
124
- # ------------------------------------------------------------
125
- # OCR → Flashcards
126
- # ------------------------------------------------------------
127
-
128
  def generate_flashcards_from_ocr_results(
129
  username: str,
130
  ocr_results: List[Dict],
131
  deck_name: str = "ocr",
132
  target_lang: str = "en",
133
  tags: Optional[List[str]] = None,
 
134
  ) -> Path:
135
  """
136
- Takes OCR results (as produced by ocr_tools.ocr_and_translate_batch)
137
- and constructs a simple vocab deck.
138
-
139
- ocr_results: list of dict with keys:
140
- - "text": original text
141
- - optionally other fields (ignored)
 
 
 
 
 
 
142
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  all_text = []
144
  for res in ocr_results:
145
- t = res.get("text") or res.get("raw_text") or ""
146
  if t:
147
  all_text.append(t)
148
  joined = "\n".join(all_text)
@@ -153,7 +194,7 @@ def generate_flashcards_from_ocr_results(
153
 
154
  translator = GoogleTranslator(source="auto", target=target_lang)
155
  cards = []
156
- for w in words:
157
  try:
158
  trans = translator.translate(w)
159
  except Exception:
@@ -162,12 +203,14 @@ def generate_flashcards_from_ocr_results(
162
  continue
163
  if trans.strip().lower() == w.strip().lower():
164
  continue
 
165
  card = {
166
  "front": w,
167
  "back": trans,
168
  "content_type": "ocr_vocab",
169
  "language": target_lang,
170
  }
 
171
  _ensure_card_stats(card)
172
  cards.append(card)
173
 
@@ -186,27 +229,73 @@ def generate_flashcards_from_ocr_results(
186
  return deck_path
187
 
188
 
189
- # ------------------------------------------------------------
190
- # Conversation/Text → Flashcards
191
- # ------------------------------------------------------------
192
-
193
  def generate_flashcards_from_text(
194
  username: str,
195
  text: str,
196
  deck_name: str = "conversation",
197
  target_lang: str = "en",
198
  tags: Optional[List[str]] = None,
 
199
  ) -> Path:
200
  """
201
- Build a vocab deck from raw conversation text.
 
 
 
 
 
 
 
 
 
 
 
202
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  words = _extract_candidate_words(text)
204
  if not words:
205
  raise ValueError("No candidate words found in text.")
206
 
207
  translator = GoogleTranslator(source="auto", target=target_lang)
208
  cards = []
209
- for w in words:
210
  try:
211
  trans = translator.translate(w)
212
  except Exception:
@@ -215,12 +304,14 @@ def generate_flashcards_from_text(
215
  continue
216
  if trans.strip().lower() == w.strip().lower():
217
  continue
 
218
  card = {
219
  "front": w,
220
  "back": trans,
221
  "content_type": "conversation_vocab",
222
  "language": target_lang,
223
  }
 
224
  _ensure_card_stats(card)
225
  cards.append(card)
226
 
@@ -239,3 +330,16 @@ def generate_flashcards_from_text(
239
  return deck_path
240
 
241
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Flashcards Tools - Enhanced with FlashcardGenerator and DifficultyScorer
4
+ """
5
 
6
  import json
7
  import re
8
  from pathlib import Path
9
+ from typing import Dict, List, Optional, Any
10
 
11
  from deep_translator import GoogleTranslator
12
 
13
  from .config import get_user_dir
14
 
15
+ # Import advanced generators (with fallback)
16
+ try:
17
+ from .flashcard_generator import FlashcardGenerator
18
+ HAS_FLASHCARD_GENERATOR = True
19
+ except ImportError:
20
+ HAS_FLASHCARD_GENERATOR = False
21
+
22
+ try:
23
+ from .difficulty_scorer import get_difficulty_scorer
24
+ HAS_DIFFICULTY_SCORER = True
25
+ except ImportError:
26
+ HAS_DIFFICULTY_SCORER = False
27
+
28
 
29
  def _get_decks_dir(username: str) -> Path:
30
+ """Returns the directory where all of a user's decks are stored."""
 
 
31
  user_dir = get_user_dir(username)
32
  decks_dir = user_dir / "decks"
33
  decks_dir.mkdir(parents=True, exist_ok=True)
 
35
 
36
 
37
  def list_user_decks(username: str) -> Dict[str, Path]:
38
+ """Returns a mapping of deck name -> deck json path."""
 
 
 
 
39
  decks_dir = _get_decks_dir(username)
40
  deck_files = sorted(decks_dir.glob("*.json"))
41
  decks: Dict[str, Path] = {}
 
47
  except Exception:
48
  name = path.stem
49
 
 
50
  if name in decks and decks[name] != path:
51
  name = f"{name} ({path.stem})"
52
  decks[name] = path
 
55
 
56
 
57
  def _ensure_card_stats(card: Dict) -> None:
58
+ """Ensure that a card has simple spaced-repetition stats."""
59
+ if "score" not in card:
 
 
60
  card["score"] = 0
61
  if "reviews" not in card:
62
  card["reviews"] = 0
63
 
64
 
65
+ def _add_difficulty_to_card(card: Dict) -> Dict:
66
+ """Add difficulty scoring to a card if DifficultyScorer is available."""
67
+ if HAS_DIFFICULTY_SCORER:
68
+ try:
69
+ scorer = get_difficulty_scorer()
70
+ return scorer.score_flashcard(card)
71
+ except Exception:
72
+ pass
73
+ return card
74
+
75
+
76
  def load_deck(path: Path) -> Dict:
77
+ """Loads a deck from JSON with stats for spaced repetition."""
 
 
 
78
  try:
79
  data = json.loads(path.read_text(encoding="utf-8"))
80
  except Exception:
81
  data = {}
82
+
83
  if "cards" not in data or not isinstance(data["cards"], list):
84
  data["cards"] = []
85
  if "name" not in data:
 
94
 
95
 
96
  def save_deck(path: Path, deck: Dict) -> None:
97
+ """Saves deck to JSON."""
 
 
98
  if "cards" not in deck:
99
  deck["cards"] = []
100
  if "name" not in deck:
 
102
  if "tags" not in deck or not isinstance(deck["tags"], list):
103
  deck["tags"] = []
104
 
 
105
  for card in deck["cards"]:
106
  _ensure_card_stats(card)
107
 
108
  path.write_text(json.dumps(deck, indent=2, ensure_ascii=False), encoding="utf-8")
109
 
110
 
 
 
 
 
111
  def _extract_candidate_words(text: str) -> List[str]:
112
+ """Simple tokenizer & filter for candidate vocab words."""
 
 
113
  tokens = re.findall(r"\b\w+\b", text, flags=re.UNICODE)
114
  out = []
115
  seen = set()
 
127
  return out
128
 
129
 
 
 
 
 
130
  def generate_flashcards_from_ocr_results(
131
  username: str,
132
  ocr_results: List[Dict],
133
  deck_name: str = "ocr",
134
  target_lang: str = "en",
135
  tags: Optional[List[str]] = None,
136
+ use_advanced_generator: bool = True,
137
  ) -> Path:
138
  """
139
+ Takes OCR results and constructs a vocab deck.
140
+
141
+ Args:
142
+ username: User identifier
143
+ ocr_results: List of OCR result dicts with 'text' key
144
+ deck_name: Name for the deck
145
+ target_lang: Target language for translations
146
+ tags: Optional tags for the deck
147
+ use_advanced_generator: Whether to use FlashcardGenerator
148
+
149
+ Returns:
150
+ Path to the saved deck
151
  """
152
+ # Try advanced generator first
153
+ if use_advanced_generator and HAS_FLASHCARD_GENERATOR:
154
+ try:
155
+ generator = FlashcardGenerator()
156
+ flashcard_data = generator.generate_flashcards(ocr_results, target_lang)
157
+ cards = flashcard_data.get('cards', [])
158
+
159
+ if cards:
160
+ # Add difficulty scores
161
+ if HAS_DIFFICULTY_SCORER:
162
+ scorer = get_difficulty_scorer()
163
+ cards = scorer.score_all_flashcards(cards)
164
+
165
+ # Ensure stats
166
+ for card in cards:
167
+ _ensure_card_stats(card)
168
+
169
+ decks_dir = _get_decks_dir(username)
170
+ deck_path = decks_dir / f"{deck_name}.json"
171
+
172
+ deck = {
173
+ "name": deck_name,
174
+ "cards": cards,
175
+ "tags": tags or ["ocr"],
176
+ "metadata": flashcard_data.get('metadata', {})
177
+ }
178
+ save_deck(deck_path, deck)
179
+ return deck_path
180
+ except Exception as e:
181
+ print(f"[flashcards_tools] Advanced generator failed: {e}, using fallback")
182
+
183
+ # Fallback to simple extraction
184
  all_text = []
185
  for res in ocr_results:
186
+ t = res.get("text") or res.get("raw_text") or res.get("original_text") or ""
187
  if t:
188
  all_text.append(t)
189
  joined = "\n".join(all_text)
 
194
 
195
  translator = GoogleTranslator(source="auto", target=target_lang)
196
  cards = []
197
+ for w in words[:20]: # Limit to 20 words
198
  try:
199
  trans = translator.translate(w)
200
  except Exception:
 
203
  continue
204
  if trans.strip().lower() == w.strip().lower():
205
  continue
206
+
207
  card = {
208
  "front": w,
209
  "back": trans,
210
  "content_type": "ocr_vocab",
211
  "language": target_lang,
212
  }
213
+ card = _add_difficulty_to_card(card)
214
  _ensure_card_stats(card)
215
  cards.append(card)
216
 
 
229
  return deck_path
230
 
231
 
 
 
 
 
232
  def generate_flashcards_from_text(
233
  username: str,
234
  text: str,
235
  deck_name: str = "conversation",
236
  target_lang: str = "en",
237
  tags: Optional[List[str]] = None,
238
+ source_lang: Optional[str] = None,
239
  ) -> Path:
240
  """
241
+ Build a vocab deck from raw text.
242
+
243
+ Args:
244
+ username: User identifier
245
+ text: Raw text to extract vocabulary from
246
+ deck_name: Name for the deck
247
+ target_lang: Target language for translations
248
+ tags: Optional tags for the deck
249
+ source_lang: Source language (auto-detect if None)
250
+
251
+ Returns:
252
+ Path to the saved deck
253
  """
254
+ # Try advanced generator first
255
+ if HAS_FLASHCARD_GENERATOR:
256
+ try:
257
+ generator = FlashcardGenerator()
258
+
259
+ # Create fake OCR result
260
+ ocr_result = {
261
+ 'original_text': text,
262
+ 'text': text,
263
+ 'detected_language': source_lang or 'auto',
264
+ }
265
+
266
+ flashcard_data = generator.generate_flashcards([ocr_result], target_lang)
267
+ cards = flashcard_data.get('cards', [])
268
+
269
+ if cards:
270
+ if HAS_DIFFICULTY_SCORER:
271
+ scorer = get_difficulty_scorer()
272
+ cards = scorer.score_all_flashcards(cards)
273
+
274
+ for card in cards:
275
+ card['content_type'] = 'conversation_vocab'
276
+ _ensure_card_stats(card)
277
+
278
+ decks_dir = _get_decks_dir(username)
279
+ deck_path = decks_dir / f"{deck_name}.json"
280
+
281
+ deck = {
282
+ "name": deck_name,
283
+ "cards": cards,
284
+ "tags": tags or ["conversation"],
285
+ }
286
+ save_deck(deck_path, deck)
287
+ return deck_path
288
+ except Exception as e:
289
+ print(f"[flashcards_tools] Advanced generator failed: {e}, using fallback")
290
+
291
+ # Fallback
292
  words = _extract_candidate_words(text)
293
  if not words:
294
  raise ValueError("No candidate words found in text.")
295
 
296
  translator = GoogleTranslator(source="auto", target=target_lang)
297
  cards = []
298
+ for w in words[:20]:
299
  try:
300
  trans = translator.translate(w)
301
  except Exception:
 
304
  continue
305
  if trans.strip().lower() == w.strip().lower():
306
  continue
307
+
308
  card = {
309
  "front": w,
310
  "back": trans,
311
  "content_type": "conversation_vocab",
312
  "language": target_lang,
313
  }
314
+ card = _add_difficulty_to_card(card)
315
  _ensure_card_stats(card)
316
  cards.append(card)
317
 
 
330
  return deck_path
331
 
332
 
333
+ def add_difficulty_to_deck(deck: Dict) -> Dict:
334
+ """Add difficulty scores to all cards in a deck."""
335
+ if not HAS_DIFFICULTY_SCORER:
336
+ return deck
337
+
338
+ try:
339
+ scorer = get_difficulty_scorer()
340
+ deck["cards"] = scorer.score_all_flashcards(deck.get("cards", []))
341
+ deck["statistics"] = scorer.get_statistics(deck["cards"])
342
+ except Exception as e:
343
+ print(f"[flashcards_tools] Difficulty scoring failed: {e}")
344
+
345
+ return deck
src/app/ocr_tools.py CHANGED
@@ -1,22 +1,331 @@
 
 
 
 
 
1
 
2
  import io
 
3
  from typing import Any, Dict, List, Optional
4
 
 
5
  from PIL import Image
6
  import pytesseract
7
  from deep_translator import GoogleTranslator
8
- from src.app.config import get_user_dir # keep this if you use it
9
 
10
- # REMOVED invalid placeholder import:
11
- # from .flashcards_tools import ...
 
 
 
 
12
 
13
- def _simple_ocr(image_bytes: bytes) -> str:
  """
15
- Fallback OCR using pytesseract.
 
16
  """
 
17
  img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
18
- text = pytesseract.image_to_string(img)
19
- return text.strip()
20
 
21
 
22
  def ocr_and_translate_batch(
@@ -25,40 +334,41 @@ def ocr_and_translate_batch(
25
  prefer_ocr_local: bool = True,
26
  ) -> List[Dict]:
27
  """
28
- Runs OCR on a batch of images. For now, we always use the
29
- simple pytesseract-based OCR, but the 'prefer_ocr_local'
30
- flag is kept for compatibility with previous versions that
31
- used a local PaddleOCR pipeline.
32
-
33
- Returns: list of dicts with keys:
34
- - "text": original OCR text
35
- - "translation": translation into target_lang
36
- - "target_lang": target_lang
37
  """
38
- translator = GoogleTranslator(source="auto", target=target_lang)
39
 
40
- results: List[Dict] = []
41
  for img_bytes in images:
42
- text = _simple_ocr(img_bytes)
43
- if text:
44
- try:
45
- translated = translator.translate(text)
46
- except Exception:
47
- translated = ""
48
-
49
- results.append(
50
- {
51
- "text": text,
52
- "translation": translated,
53
- "target_lang": target_lang,
54
- }
55
- )
56
- else:
57
- results.append(
58
- {
59
- "text": "",
60
- "translation": "",
61
- "target_lang": target_lang,
62
- }
63
- )
64
  return results
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ OCR Tools - Advanced text extraction with multi-language support
4
+ Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian
5
+ """
6
 
7
  import io
8
+ import re
9
  from typing import Any, Dict, List, Optional
10
 
11
+ import numpy as np
12
  from PIL import Image
13
  import pytesseract
14
  from deep_translator import GoogleTranslator
 
15
 
16
+ # Try to import optional dependencies
17
+ try:
18
+ import cv2
19
+ HAS_CV2 = True
20
+ except ImportError:
21
+ HAS_CV2 = False
22
 
23
+ try:
24
+ from langdetect import detect
25
+ HAS_LANGDETECT = True
26
+ except ImportError:
27
+ HAS_LANGDETECT = False
28
+
29
+ try:
30
+ from paddleocr import PaddleOCR
31
+ HAS_PADDLEOCR = True
32
+ _paddle_ocr = None
33
+ except ImportError:
34
+ HAS_PADDLEOCR = False
35
+ _paddle_ocr = None
36
+
37
+
38
+ # Language code mapping
39
+ LANG_CODE_MAP = {
40
+ 'zh-cn': 'zh-CN',
41
+ 'zh-tw': 'zh-TW',
42
+ 'en': 'en',
43
+ 'ja': 'ja',
44
+ 'ko': 'ko',
45
+ 'fr': 'fr',
46
+ 'de': 'de',
47
+ 'es': 'es',
48
+ 'ru': 'ru',
49
+ }
50
+
51
+ # Tesseract language codes for each supported language
52
+ TESSERACT_LANG_MAP = {
53
+ 'en': 'eng',
54
+ 'english': 'eng',
55
+ 'zh-cn': 'chi_sim',
56
+ 'chinese': 'chi_sim',
57
+ 'zh-tw': 'chi_tra',
58
+ 'ja': 'jpn',
59
+ 'japanese': 'jpn',
60
+ 'ko': 'kor',
61
+ 'korean': 'kor',
62
+ 'de': 'deu',
63
+ 'german': 'deu',
64
+ 'es': 'spa',
65
+ 'spanish': 'spa',
66
+ 'ru': 'rus',
67
+ 'russian': 'rus',
68
+ 'fr': 'fra',
69
+ 'french': 'fra',
70
+ }
71
+
72
+
73
+ def _get_paddle_ocr():
74
+ """Lazily initialize PaddleOCR"""
75
+ global _paddle_ocr
76
+ if HAS_PADDLEOCR and _paddle_ocr is None:
77
+ try:
78
+ _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
79
+ except Exception as e:
80
+ print(f"[OCR] PaddleOCR init failed: {e}")
81
+ return _paddle_ocr
82
+
83
+
84
+ def filter_pinyin_keep_chinese(text: str) -> str:
85
  """
86
+ Filter out pinyin and keep only Chinese characters.
87
+ Preserves complete sentences with Chinese characters.
88
  """
89
+ lines = text.split('\n')
90
+ filtered_lines = []
91
+
92
+ for line in lines:
93
+ line_stripped = line.strip()
94
+ if not line_stripped:
95
+ continue
96
+
97
+ # Check if line contains Chinese characters
98
+ has_chinese = bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', line))
99
+
100
+ # Check if line is pure pinyin
101
+ is_pinyin = bool(re.match(r'^[a-zA-Z\u0101\u00e1\u01ce\u00e0\u0113\u00e9\u011b\u00e8\u012b\u00ed\u01d0\u00ec\u014d\u00f3\u01d2\u00f2\u016b\u00fa\u01d4\u00f9\u00fc\u01d6\u01d8\u01da\u01dc\u0144\u0148\u01f9\s]+$', line_stripped))
102
+
103
+ if is_pinyin:
104
+ continue
105
+
106
+ if has_chinese:
107
+ chinese_parts = re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf]+', line)
108
+ if chinese_parts:
109
+ filtered_lines.append(''.join(chinese_parts))
110
+
111
+ return '\n'.join(filtered_lines)
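An illustrative example of the intended behaviour, inferred from the regexes above rather than from a captured run:

# Sketch only: a pure-pinyin line is dropped; Latin text and punctuation are
# stripped from lines that contain Chinese characters.
sample = "nǐ hǎo\n你好，世界 hello"
filter_pinyin_keep_chinese(sample)   # expected: "你好世界"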
112
+
113
+
114
+ def detect_language_from_text(text: str) -> str:
115
+ """Detect language, with special handling for Chinese characters"""
116
+ has_chinese = bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text))
117
+ if has_chinese:
118
+ return 'zh-cn'
119
+
120
+ has_japanese = bool(re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text))
121
+ if has_japanese:
122
+ return 'ja'
123
+
124
+ has_korean = bool(re.search(r'[\uac00-\ud7af]', text))
125
+ if has_korean:
126
+ return 'ko'
127
+
128
+ if HAS_LANGDETECT:
129
+ try:
130
+ return detect(text)
131
+ except Exception:
132
+ pass
133
+
134
+ return 'en'
135
+
136
+
137
+ def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
138
+ """Apply image preprocessing for better OCR accuracy"""
139
+ if not HAS_CV2:
140
+ return img_array
141
+
142
+ # Convert to grayscale if needed
143
+ if len(img_array.shape) == 3:
144
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
145
+ else:
146
+ gray = img_array
147
+
148
+ if method == 'simple':
149
+ _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
150
+ return binary
151
+ elif method == 'adaptive':
152
+ return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
153
+ elif method == 'clahe':
154
+ clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
155
+ enhanced = clahe.apply(gray)
156
+ _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
157
+ return binary
158
+ elif method == 'denoised':
159
+ kernel = np.ones((2, 2), np.uint8)
160
+ denoised = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel, iterations=1)
161
+ _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
162
+ return binary
163
+ elif method == 'advanced':
164
+ clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
165
+ enhanced = clahe.apply(gray)
166
+ denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
167
+ return cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
168
+ else:
169
+ return gray
170
+
171
+
172
+ def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
173
+ """Use PaddleOCR for text extraction (best for Chinese)"""
174
+ paddle = _get_paddle_ocr()
175
+ if paddle is None:
176
+ return None, 0
177
+
178
+ try:
179
+ img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
180
+ img_array = np.array(img)
181
+
182
+ result = paddle.ocr(img_array, cls=True)
183
+
184
+ if not result or len(result) == 0 or result[0] is None:
185
+ return None, 0
186
+
187
+ texts = []
188
+ scores = []
189
+ for line in result[0]:
190
+ if line and len(line) >= 2:
191
+ text_info = line[1]
192
+ if isinstance(text_info, tuple) and len(text_info) >= 2:
193
+ texts.append(text_info[0])
194
+ scores.append(text_info[1])
195
+
196
+ if not texts:
197
+ return None, 0
198
+
199
+ full_text = '\n'.join(texts)
200
+ avg_confidence = sum(scores) / len(scores) if scores else 0
201
+
202
+ return full_text, avg_confidence * 100
203
+
204
+ except Exception as e:
205
+ print(f"[OCR] PaddleOCR error: {e}")
206
+ return None, 0
207
+
208
+
209
+ def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
210
+ """Use Tesseract with multiple preprocessing methods"""
211
  img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
212
+ img_array = np.array(img)
213
+
214
+ best_text = ""
215
+ best_confidence = 0
216
+ best_method = ""
217
+
218
+ # Try different preprocessing methods
219
+ methods = ['simple', 'adaptive', 'clahe', 'denoised']
220
+ if HAS_CV2:
221
+ methods.append('advanced')
222
+
223
+ for method in methods:
224
+ try:
225
+ if HAS_CV2:
226
+ processed = _preprocess_image(img_array, method)
227
+ processed_img = Image.fromarray(processed)
228
+ else:
229
+ processed_img = img
230
+
231
+ # Get OCR data with confidence
232
+ data = pytesseract.image_to_data(processed_img, lang=lang, output_type=pytesseract.Output.DICT)
233
+ text = pytesseract.image_to_string(processed_img, lang=lang)
234
+
235
+ # Calculate average confidence
236
+ confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
237
+ avg_confidence = sum(confidences) / len(confidences) if confidences else 0
238
+
239
+ if text.strip() and avg_confidence > best_confidence:
240
+ best_text = text
241
+ best_confidence = avg_confidence
242
+ best_method = method
243
+
244
+ except Exception:
245
+ continue
246
+
247
+ return best_text.strip(), best_confidence, best_method
248
+
249
+
250
+ def ocr_single_image(
251
+ image_bytes: bytes,
252
+ source_lang: Optional[str] = None,
253
+ target_lang: str = "en",
254
+ use_paddle: bool = True,
255
+ ) -> Dict[str, Any]:
256
+ """
257
+ Extract text from a single image and translate.
258
+
259
+ Args:
260
+ image_bytes: Raw image bytes
261
+ source_lang: Source language hint (auto-detect if None)
262
+ target_lang: Target language for translation
263
+ use_paddle: Whether to try PaddleOCR first
264
+
265
+ Returns:
266
+ Dict with original_text, translated_text, detected_language, confidence, method
267
+ """
268
+ best_text = ""
269
+ best_method = ""
270
+ best_confidence = 0
271
+
272
+ # Determine Tesseract language string
273
+ tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
274
+ if source_lang:
275
+ mapped = TESSERACT_LANG_MAP.get(source_lang.lower())
276
+ if mapped:
277
+ tess_lang = mapped
278
+
279
+ # Try PaddleOCR first (best for Chinese)
280
+ if use_paddle and HAS_PADDLEOCR:
281
+ paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
282
+ if paddle_text and paddle_text.strip():
283
+ best_text = paddle_text
284
+ best_method = "PaddleOCR"
285
+ best_confidence = paddle_conf
286
+
287
+ # Try Tesseract (fallback or if PaddleOCR failed)
288
+ if not best_text.strip():
289
+ tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
290
+ if tess_text and (tess_conf > best_confidence or not best_text):
291
+ best_text = tess_text
292
+ best_method = f"Tesseract-{tess_method}"
293
+ best_confidence = tess_conf
294
+
295
+ if not best_text.strip():
296
+ return {
297
+ "original_text": "",
298
+ "translated_text": "",
299
+ "detected_language": "unknown",
300
+ "confidence": 0,
301
+ "method": "none",
302
+ "error": "No text detected"
303
+ }
304
+
305
+ # Filter pinyin for Chinese text
306
+ filtered_text = filter_pinyin_keep_chinese(best_text)
307
+ if not filtered_text.strip():
308
+ filtered_text = best_text
309
+
310
+ # Detect language
311
+ detected_lang = detect_language_from_text(filtered_text)
312
+
313
+ # Translate
314
+ try:
315
+ source = LANG_CODE_MAP.get(detected_lang, detected_lang)
316
+ target = LANG_CODE_MAP.get(target_lang, target_lang)
317
+ translator = GoogleTranslator(source=source, target=target)
318
+ translated = translator.translate(filtered_text)
319
+ except Exception as e:
320
+ translated = ""
321
+
322
+ return {
323
+ "original_text": filtered_text.strip(),
324
+ "translated_text": translated.strip() if translated else "",
325
+ "detected_language": detected_lang,
326
+ "confidence": round(best_confidence, 2),
327
+ "method": best_method
328
+ }
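A hedged usage sketch for the single-image entry point above (the input file name is hypothetical):

# Hypothetical usage -- not part of the commit.
from pathlib import Path
from src.app.ocr_tools import ocr_single_image

image_bytes = Path("sample_page.jpg").read_bytes()      # hypothetical input image
result = ocr_single_image(image_bytes, target_lang="en")
if result.get("error"):
    print("No text detected")
else:
    print(result["detected_language"], result["confidence"], result["method"])
    print(result["original_text"], "->", result["translated_text"])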
329
 
330
 
331
  def ocr_and_translate_batch(
 
334
  prefer_ocr_local: bool = True,
335
  ) -> List[Dict]:
336
  """
337
+ Runs OCR on a batch of images with advanced processing.
338
+
339
+ Args:
340
+ images: List of image bytes
341
+ target_lang: Target language for translation
342
+ prefer_ocr_local: Whether to prefer local OCR (PaddleOCR)
343
+
344
+ Returns:
345
+ List of dicts with OCR results
346
  """
347
+ results = []
348
 
 
349
  for img_bytes in images:
350
+ result = ocr_single_image(
351
+ image_bytes=img_bytes,
352
+ target_lang=target_lang,
353
+ use_paddle=prefer_ocr_local and HAS_PADDLEOCR
354
+ )
355
+
356
+ # Convert to expected format for backward compatibility
357
+ results.append({
358
+ "text": result.get("original_text", ""),
359
+ "translation": result.get("translated_text", ""),
360
+ "target_lang": target_lang,
361
+ "detected_language": result.get("detected_language", "unknown"),
362
+ "confidence": result.get("confidence", 0),
363
+ "method": result.get("method", "unknown"),
364
+ })
365
+
366
  return results
367
+
368
+
369
+ # Keep old function for backward compatibility
370
+ def _simple_ocr(image_bytes: bytes) -> str:
371
+ """Simple OCR using pytesseract (backward compatibility)"""
372
+ img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
373
+ text = pytesseract.image_to_string(img)
374
+ return text.strip()
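For the batch wrapper, a sketch of how a caller might consume the backward-compatible result keys (the image folder is hypothetical):

# Hypothetical usage -- not part of the commit.
from pathlib import Path
from src.app.ocr_tools import ocr_and_translate_batch

images = [p.read_bytes() for p in sorted(Path("pages").glob("*.png"))]   # hypothetical folder
for r in ocr_and_translate_batch(images, target_lang="en"):
    # "text"/"translation"/"target_lang" keep the old contract;
    # "detected_language"/"confidence"/"method" are the new fields.
    print(r["method"], r["confidence"], r["text"], "->", r["translation"])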
src/app/quiz_tools.py CHANGED
@@ -1,17 +1,329 @@
1
-
2
- # src/app/quiz_tools.py
3
-
4
- # Placeholder restored because modifications moved to main_app.
5
- # This keeps the file present so import does not fail.
6
 
7
  import json
 
8
  import random
9
  from datetime import datetime
 
 
 
10
  from .config import get_user_dir
11
- from .flashcards_tools import ...
12
 
13
 
14
- def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int = 5):
15
  reading_passages = [
16
  f"{topic.capitalize()} is important in daily life. Many people enjoy talking about it.",
17
  f"Here is a short story based on the topic '{topic}'.",
@@ -25,28 +337,25 @@ def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int
25
 
26
  if q_type == "translate_phrase":
27
  questions.append({
 
28
  "type": "semantic_translate_phrase",
29
- "prompt": f"Translate:
30
-
31
- '{passage}'",
32
  "answer": "(model evaluated)",
33
  "explanation": f"Checks ability to translate topic '{topic}'."
34
  })
35
  elif q_type == "summarize":
36
  questions.append({
 
37
  "type": "semantic_summarize",
38
- "prompt": f"Summarize:
39
-
40
- {passage}",
41
  "answer": "(model evaluated)",
42
  "explanation": f"Checks comprehension of topic '{topic}'."
43
  })
44
  elif q_type == "interpret":
45
  questions.append({
 
46
  "type": "semantic_interpret",
47
- "prompt": f"Interpret meaning:
48
-
49
- {passage}",
50
  "answer": "(model evaluated)",
51
  "explanation": f"Checks conceptual understanding of '{topic}'."
52
  })
@@ -58,5 +367,59 @@ def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int
58
  "id": quiz_id,
59
  "created_at": ts,
60
  "topic": topic,
 
61
  "questions": questions,
62
  }
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Quiz Tools - AI-Powered Quiz Generation from Flashcards
4
+ Supports multiple question types and uses OpenAI API for intelligent quiz creation
5
+ """
6
 
7
  import json
8
+ import os
9
  import random
10
  from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Dict, List, Any, Optional
13
+
14
  from .config import get_user_dir
15
+ from .flashcards_tools import load_deck, list_user_decks
16
+
17
+ # Try to import OpenAI
18
+ try:
19
+ from openai import OpenAI
20
+ HAS_OPENAI = True
21
+ except ImportError:
22
+ HAS_OPENAI = False
23
+
24
+
25
+ class QuizGenerator:
26
+ """Generate intelligent quizzes using OpenAI API"""
27
+
28
+ QUESTION_TYPES = [
29
+ 'multiple_choice',
30
+ 'fill_in_blank',
31
+ 'true_false',
32
+ 'matching',
33
+ 'short_answer'
34
+ ]
35
+
36
+ def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o-mini"):
37
+ """
38
+ Initialize the quiz generator
39
+
40
+ Args:
41
+ api_key: OpenAI API key (uses env var if not provided)
42
+ model: Model to use for quiz generation
43
+ """
44
+ self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
45
+ self.model = model
46
+ self.client = None
47
+
48
+ if HAS_OPENAI and self.api_key:
49
+ try:
50
+ self.client = OpenAI(api_key=self.api_key)
51
+ except Exception as e:
52
+ print(f"[QuizGenerator] OpenAI init failed: {e}")
53
+
54
+ def _prepare_flashcard_context(self, flashcards: List[Dict], max_cards: int = 20) -> str:
55
+ """Prepare flashcard data as context for AI"""
56
+ selected_cards = flashcards[:max_cards] if len(flashcards) > max_cards else flashcards
57
+
58
+ context_parts = []
59
+ for idx, card in enumerate(selected_cards, 1):
60
+ card_info = (
61
+ f"{idx}. Word: {card.get('front', '')}\n"
62
+ f" Translation: {card.get('back', '')}\n"
63
+ f" Language: {card.get('language', 'unknown')}\n"
64
+ f" Context: {card.get('context', 'N/A')}"
65
+ )
66
+ context_parts.append(card_info)
67
+
68
+ return "\n\n".join(context_parts)
69
+
70
+ def _create_quiz_prompt(self, flashcards: List[Dict], num_questions: int = 30) -> str:
71
+ """Create the prompt for AI quiz generation"""
72
+ flashcard_context = self._prepare_flashcard_context(flashcards)
73
+
74
+ prompt = f"""You are an expert language teacher creating a QUESTION BANK to test students' knowledge of vocabulary.
75
+
76
+ Based on the following flashcards, generate exactly {num_questions} diverse quiz questions.
77
+
78
+ FLASHCARDS:
79
+ {flashcard_context}
80
+
81
+ REQUIREMENTS:
82
+ 1. Generate exactly {num_questions} questions
83
+ 2. Use different question types: multiple_choice, fill_in_blank, true_false, matching, short_answer
84
+ 3. Questions should test different aspects: vocabulary recall, context understanding, usage
85
+ 4. Each question must include the correct answer
86
+ 5. For multiple choice questions, provide 4 options with one correct answer
87
+ 6. For matching questions, provide 4 word-translation pairs
88
+ 7. Make questions challenging but fair
89
+ 8. Vary difficulty levels across questions
90
+
91
+ OUTPUT FORMAT (JSON):
92
+ {{
93
+ "quiz_title": "Vocabulary Quiz",
94
+ "total_questions": {num_questions},
95
+ "questions": [
96
+ {{
97
+ "question_number": 1,
98
+ "type": "multiple_choice",
99
+ "question": "What does 'word' mean?",
100
+ "options": ["Option A", "Option B", "Option C", "Option D"],
101
+ "correct_answer": "Option B",
102
+ "explanation": "Brief explanation."
103
+ }},
104
+ {{
105
+ "question_number": 2,
106
+ "type": "fill_in_blank",
107
+ "question": "Complete: The ___ ran quickly.",
108
+ "correct_answer": "cat",
109
+ "explanation": "Brief explanation."
110
+ }},
111
+ {{
112
+ "question_number": 3,
113
+ "type": "true_false",
114
+ "question": "'Word' means 'definition' in English.",
115
+ "correct_answer": false,
116
+ "explanation": "Brief explanation."
117
+ }},
118
+ {{
119
+ "question_number": 4,
120
+ "type": "matching",
121
+ "question": "Match the words to their correct translations",
122
+ "pairs": [
123
+ {{"word": "word1", "translation": "translation1"}},
124
+ {{"word": "word2", "translation": "translation2"}},
125
+ {{"word": "word3", "translation": "translation3"}},
126
+ {{"word": "word4", "translation": "translation4"}}
127
+ ],
128
+ "correct_answer": "All pairs are correctly matched",
129
+ "explanation": "Brief explanation."
130
+ }},
131
+ {{
132
+ "question_number": 5,
133
+ "type": "short_answer",
134
+ "question": "Explain the usage of 'word'.",
135
+ "correct_answer": "Model answer here.",
136
+ "explanation": "Brief explanation."
137
+ }}
138
+ ]
139
+ }}
140
+
141
+ Generate the quiz now:"""
142
+
143
+ return prompt
144
+
145
+ def generate_quiz_with_ai(self, flashcards: List[Dict], num_questions: int = 30) -> Dict[str, Any]:
146
+ """Generate quiz using OpenAI API"""
147
+ if not self.client:
148
+ raise ValueError("OpenAI client not initialized. Check API key.")
149
+
150
+ if not flashcards:
151
+ raise ValueError("No flashcards provided for quiz generation")
152
+
153
+ prompt = self._create_quiz_prompt(flashcards, num_questions)
154
+
155
+ try:
156
+ response = self.client.chat.completions.create(
157
+ model=self.model,
158
+ messages=[
159
+ {
160
+ "role": "system",
161
+ "content": "You are an expert language teacher who creates engaging, educational quizzes. Always respond with valid JSON."
162
+ },
163
+ {
164
+ "role": "user",
165
+ "content": prompt
166
+ }
167
+ ],
168
+ response_format={"type": "json_object"},
169
+ temperature=0.7,
170
+ max_tokens=4000
171
+ )
172
+
173
+ quiz_content = response.choices[0].message.content
174
+ quiz_data = json.loads(quiz_content)
175
+
176
+ quiz_data['metadata'] = {
177
+ 'generator': 'AI-Powered Quiz Generator',
178
+ 'model': self.model,
179
+ 'source_flashcards': len(flashcards),
180
+ 'tokens_used': response.usage.total_tokens if response.usage else 0
181
+ }
182
+
183
+ return quiz_data
184
+
185
+ except Exception as e:
186
+ print(f"[QuizGenerator] AI generation failed: {e}")
187
+ raise
188
+
189
+ def generate_simple_quiz(self, flashcards: List[Dict], num_questions: int = 5) -> Dict[str, Any]:
190
+ """Generate a simple quiz without AI (fallback)"""
191
+ if not flashcards:
192
+ raise ValueError("No flashcards provided")
193
+
194
+ questions = []
195
+ used_cards = random.sample(flashcards, min(num_questions * 2, len(flashcards)))
196
+
197
+ for i, card in enumerate(used_cards[:num_questions]):
198
+ q_type = random.choice(['multiple_choice', 'fill_in_blank', 'true_false'])
199
+
200
+ if q_type == 'multiple_choice':
201
+ # Create wrong options from other cards
202
+ other_cards = [c for c in flashcards if c != card]
203
+ wrong_options = random.sample(
204
+ [c.get('back', 'Unknown') for c in other_cards],
205
+ min(3, len(other_cards))
206
+ )
207
+ while len(wrong_options) < 3:
208
+ wrong_options.append(f"Not {card.get('back', 'this')}")
209
+
210
+ options = wrong_options + [card.get('back', '')]
211
+ random.shuffle(options)
212
+
213
+ questions.append({
214
+ "question_number": i + 1,
215
+ "type": "multiple_choice",
216
+ "question": f"What does '{card.get('front', '')}' mean?",
217
+ "options": options,
218
+ "correct_answer": card.get('back', ''),
219
+ "explanation": f"'{card.get('front', '')}' translates to '{card.get('back', '')}'."
220
+ })
221
 
222
+ elif q_type == 'fill_in_blank':
223
+ questions.append({
224
+ "question_number": i + 1,
225
+ "type": "fill_in_blank",
226
+ "question": f"Translate: '{card.get('front', '')}' = _____",
227
+ "correct_answer": card.get('back', ''),
228
+ "explanation": f"The correct translation is '{card.get('back', '')}'."
229
+ })
230
 
231
+ elif q_type == 'true_false':
232
+ is_true = random.choice([True, False])
233
+ if is_true:
234
+ shown_answer = card.get('back', '')
235
+ else:
236
+ other_cards = [c for c in flashcards if c != card]
237
+ if other_cards:
238
+ shown_answer = random.choice(other_cards).get('back', 'something else')
239
+ else:
240
+ shown_answer = f"Not {card.get('back', 'this')}"
241
+
242
+ questions.append({
243
+ "question_number": i + 1,
244
+ "type": "true_false",
245
+ "question": f"'{card.get('front', '')}' means '{shown_answer}'.",
246
+ "correct_answer": is_true,
247
+ "explanation": f"'{card.get('front', '')}' actually means '{card.get('back', '')}'."
248
+ })
249
+
250
+ return {
251
+ "quiz_title": "Vocabulary Quiz",
252
+ "total_questions": len(questions),
253
+ "questions": questions,
254
+ "metadata": {
255
+ "generator": "Simple Quiz Generator",
256
+ "source_flashcards": len(flashcards)
257
+ }
258
+ }
259
+
260
+
261
+ def create_quiz_from_deck(
262
+ username: str,
263
+ deck_name: str,
264
+ num_questions: int = 5,
265
+ use_ai: bool = True,
266
+ api_key: Optional[str] = None
267
+ ) -> Dict[str, Any]:
268
+ """
269
+ Create a quiz from a user's flashcard deck
270
+
271
+ Args:
272
+ username: User identifier
273
+ deck_name: Name of the deck to create quiz from
274
+ num_questions: Number of questions for the quiz session
275
+ use_ai: Whether to use AI for quiz generation
276
+ api_key: Optional OpenAI API key
277
+
278
+ Returns:
279
+ Quiz dictionary with questions
280
+ """
281
+ decks = list_user_decks(username)
282
+
283
+ if deck_name not in decks:
284
+ raise ValueError(f"Deck '{deck_name}' not found")
285
+
286
+ deck = load_deck(decks[deck_name])
287
+ flashcards = deck.get('cards', [])
288
+
289
+ if not flashcards:
290
+ raise ValueError(f"Deck '{deck_name}' has no cards")
291
+
292
+ generator = QuizGenerator(api_key=api_key)
293
+
294
+ try:
295
+ if use_ai and generator.client:
296
+ # Generate larger question bank with AI
297
+ quiz = generator.generate_quiz_with_ai(flashcards, num_questions=30)
298
+ else:
299
+ # Use simple generator
300
+ quiz = generator.generate_simple_quiz(flashcards, num_questions=num_questions)
301
+ except Exception as e:
302
+ print(f"[quiz_tools] AI quiz generation failed: {e}, using simple generator")
303
+ quiz = generator.generate_simple_quiz(flashcards, num_questions=num_questions)
304
+
305
+ # Add quiz metadata
306
+ ts = datetime.utcnow().strftime("%Y-%m-%dT%H-%M-%SZ")
307
+ quiz['id'] = f"quiz_{ts}"
308
+ quiz['created_at'] = ts
309
+ quiz['deck_name'] = deck_name
310
+ quiz['questions_per_session'] = num_questions
311
+
312
+ return quiz
313
+
314
+
315
+ def create_semantic_quiz_for_user(username: str, topic: str, num_questions: int = 5) -> Dict[str, Any]:
316
+ """
317
+ Create a semantic quiz based on a topic (for conversation practice)
318
+
319
+ Args:
320
+ username: User identifier
321
+ topic: Topic for the quiz
322
+ num_questions: Number of questions
323
+
324
+ Returns:
325
+ Quiz dictionary
326
+ """
327
  reading_passages = [
328
  f"{topic.capitalize()} is important in daily life. Many people enjoy talking about it.",
329
  f"Here is a short story based on the topic '{topic}'.",
 
337
 
338
  if q_type == "translate_phrase":
339
  questions.append({
340
+ "question_number": i + 1,
341
  "type": "semantic_translate_phrase",
342
+ "prompt": f"Translate:\n\n'{passage}'",
 
 
343
  "answer": "(model evaluated)",
344
  "explanation": f"Checks ability to translate topic '{topic}'."
345
  })
346
  elif q_type == "summarize":
347
  questions.append({
348
+ "question_number": i + 1,
349
  "type": "semantic_summarize",
350
+ "prompt": f"Summarize:\n\n{passage}",
 
 
351
  "answer": "(model evaluated)",
352
  "explanation": f"Checks comprehension of topic '{topic}'."
353
  })
354
  elif q_type == "interpret":
355
  questions.append({
356
+ "question_number": i + 1,
357
  "type": "semantic_interpret",
358
+ "prompt": f"Interpret meaning:\n\n{passage}",
 
 
359
  "answer": "(model evaluated)",
360
  "explanation": f"Checks conceptual understanding of '{topic}'."
361
  })
 
367
  "id": quiz_id,
368
  "created_at": ts,
369
  "topic": topic,
370
+ "total_questions": len(questions),
371
  "questions": questions,
372
  }
373
+
374
+
375
+ def save_quiz(username: str, quiz: Dict[str, Any]) -> Path:
376
+ """Save a quiz to the user's directory"""
377
+ user_dir = get_user_dir(username)
378
+ quizzes_dir = user_dir / "quizzes"
379
+ quizzes_dir.mkdir(parents=True, exist_ok=True)
380
+
381
+ quiz_id = quiz.get('id', f"quiz_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}")
382
+ quiz_path = quizzes_dir / f"{quiz_id}.json"
383
+
384
+ with open(quiz_path, 'w', encoding='utf-8') as f:
385
+ json.dump(quiz, f, ensure_ascii=False, indent=2)
386
+
387
+ return quiz_path
388
+
389
+
390
+ def load_quiz(username: str, quiz_id: str) -> Dict[str, Any]:
391
+ """Load a saved quiz"""
392
+ user_dir = get_user_dir(username)
393
+ quiz_path = user_dir / "quizzes" / f"{quiz_id}.json"
394
+
395
+ if not quiz_path.exists():
396
+ raise FileNotFoundError(f"Quiz '{quiz_id}' not found")
397
+
398
+ with open(quiz_path, 'r', encoding='utf-8') as f:
399
+ return json.load(f)
400
+
401
+
402
+ def list_user_quizzes(username: str) -> List[Dict[str, Any]]:
403
+ """List all quizzes for a user"""
404
+ user_dir = get_user_dir(username)
405
+ quizzes_dir = user_dir / "quizzes"
406
+
407
+ if not quizzes_dir.exists():
408
+ return []
409
+
410
+ quizzes = []
411
+ for quiz_file in sorted(quizzes_dir.glob("*.json"), reverse=True):
412
+ try:
413
+ with open(quiz_file, 'r', encoding='utf-8') as f:
414
+ quiz = json.load(f)
415
+ quizzes.append({
416
+ 'id': quiz.get('id', quiz_file.stem),
417
+ 'title': quiz.get('quiz_title', 'Untitled Quiz'),
418
+ 'created_at': quiz.get('created_at', ''),
419
+ 'total_questions': quiz.get('total_questions', 0),
420
+ 'deck_name': quiz.get('deck_name', ''),
421
+ })
422
+ except Exception:
423
+ continue
424
+
425
+ return quizzes
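To tie the quiz helpers together, a minimal end-to-end sketch (the username and deck name are illustrative; use_ai=False forces the non-AI fallback so no API key is needed):

# Hypothetical usage -- not part of the commit.
from src.app.quiz_tools import create_quiz_from_deck, save_quiz, list_user_quizzes, load_quiz

quiz = create_quiz_from_deck("alice", "conversation_vocab", num_questions=5, use_ai=False)
path = save_quiz("alice", quiz)
print("saved:", path)

for meta in list_user_quizzes("alice"):
    print(meta["id"], meta["title"], meta["total_questions"])

reloaded = load_quiz("alice", quiz["id"])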