diff --git "a/scan_html_folder.py" "b/scan_html_folder.py" new file mode 100644--- /dev/null +++ "b/scan_html_folder.py" @@ -0,0 +1,4789 @@ +""" +Enhanced QA Scanner for HTML Translation Files + +This module provides comprehensive quality assurance scanning for translated HTML files, +including duplicate detection, foreign character detection, and translation artifact detection. + +PERFORMANCE IMPROVEMENTS: +- Added detailed progress indicators for all slow operations +- Shows estimated time remaining for long operations +- Displays current file being scanned +- Provides progress updates every 5-10% +- Added timing information for each phase +- MinHash optimization status messages +- Debug output for stop functionality + +OPTIMIZATION TIPS: +- For datasets > 100 files, avoid AI Hunter mode (use aggressive instead) +- Install 'datasketch' package for 2-10x faster duplicate detection: pip install datasketch +- Use 'summary' report format for faster completion +- Disable checks you don't need in QA Scanner Settings +""" + + +import os +import hashlib +import json +import zipfile +import csv +from bs4 import BeautifulSoup +from langdetect import detect, LangDetectException +from difflib import SequenceMatcher +from collections import Counter, defaultdict +from tqdm import tqdm +import tkinter as tk +from tkinter import filedialog, messagebox +import threading +import re +import unicodedata +import time +import html as html_lib +from typing import Dict, List, Tuple, Set, Optional +import warnings +from functools import lru_cache +import concurrent.futures +import multiprocessing +from threading import Lock + +# Add a global lock for thread-safe operations +merge_lock = Lock() + +# Global variable for text samples mapping +_global_text_samples = {} + +warnings.filterwarnings('ignore') + +# Try to import optional dependencies +try: + from datasketch import MinHash, MinHashLSH + MINHASH_AVAILABLE = True +except ImportError: + MINHASH_AVAILABLE = False + #"Note: Install 'datasketch' package for faster duplicate detection on large datasets if running it as a script + +# Global flag to allow stopping the scan externally +_stop_flag = False + +def stop_scan(): + """Set the stop flag to True + + This function should be called by the GUI to stop a running scan. + The GUI code needs to: + 1. Import this function: from scan_html_folder import stop_scan + 2. Call it in the stop_qa_scan method: stop_scan() + 3. 
Update the QA button to show "Stop Scan" when scan is running + """ + global _stop_flag + _stop_flag = True + print("π STOP SCAN CALLED - Global flag set to True") # More visible debug + return True # Return True to confirm it was called + +# Configuration class for duplicate detection +class DuplicateDetectionConfig: + def __init__(self, mode='quick-scan', custom_settings=None): + self.mode = mode + self.custom_settings = custom_settings + self.thresholds = { + 'aggressive': { + 'similarity': 0.75, + 'semantic': 0.70, + 'structural': 0.80, + 'consecutive_chapters': 3, + 'word_overlap': 0.65, + 'minhash_threshold': 0.70 + }, + 'quick-scan': { # Optimized for speed + 'similarity': 0.85, + 'semantic': 0.80, + 'structural': 0.90, + 'consecutive_chapters': 1, # Only check adjacent chapters + 'word_overlap': 0.75, + 'minhash_threshold': 0.80, + 'skip_semantic': True, # Skip expensive calculations + 'skip_structural': True, + 'skip_minhash': True, + 'sample_size': 1000, # Smaller sample + 'check_all_pairs': False # Never check all pairs + }, + 'custom': { + 'similarity': 0.85, + 'semantic': 0.80, + 'structural': 0.90, + 'consecutive_chapters': 2, + 'word_overlap': 0.75, + 'minhash_threshold': 0.80, + 'check_all_pairs': False, + 'sample_size': 3000, + 'min_text_length': 500 + }, + 'ai-hunter': { + 'similarity': 0.30, + 'semantic': 0.85, + 'structural': 0.85, + 'consecutive_chapters': 5, + 'word_overlap': 0.50, + 'minhash_threshold': 0.60, + 'check_all_pairs': True + } + } + + # Override with custom settings if mode is 'custom' + if mode == 'custom' and custom_settings: + self.thresholds['custom'].update(custom_settings.get('thresholds', {})) + for key in ['consecutive_chapters', 'check_all_pairs', 'sample_size', 'min_text_length']: + if key in custom_settings: + self.thresholds['custom'][key] = custom_settings[key] + + def get_threshold(self, key): + return self.thresholds[self.mode].get(key, 0.8) + +# Constants +DASH_CHARS = { + '-', 'β', 'β', 'β', 'βΈΊ', 'βΈ»', 'οΉ', 'οΉ£', 'οΌ', 'β', 'β', 'β', 'β', + '_', 'β', 'β', 'β', 'β', 'β', 'β', 'β ', 'β', 'β', 'β―', 'β€', 'οΌΏ', + 'οΌ', '*', '~', 'ο½', 'βΌ', 'γ', 'γ ‘' # Added Korean dash character +} + +COMMON_WORDS = { + 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', + 'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'after', + 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', + 'do', 'does', 'did', 'will', 'would', 'should', 'could', 'may', 'might', + 'chapter', 'each', 'person', 'persons', 'he', 'she', 'it', 'they', 'them', + 'his', 'her', 'their', 'this', 'that', 'these', 'those', 'which', 'who', + 'what', 'where', 'when', 'why', 'how', 'all', 'some', 'any', 'no', 'not' +} + +# Korean dash patterns to EXCLUDE from detection +KOREAN_DASH_PATTERNS = [ + r'[γ ‘βββ\-]+', # Korean dashes and similar + r'[\u2014\u2015\u2500-\u257F]+', # Box drawing characters often used in Korean text + r'[\u3161\u3163\u3164]+', # Korean filler characters +] + +# Extended Korean separator characters to exclude from non-English detection +KOREAN_SEPARATOR_CHARS = { + 'γ ‘', # Korean dash/separator (U+3161) + 'β', # Horizontal bar (U+2015) + 'β', # Em dash (U+2014) + 'β', # En dash (U+2013) + 'οΌ»', 'οΌ½', # Full-width brackets + 'γ', 'γ', # Black lenticular brackets + 'γ', 'γ', # Tortoise shell brackets + 'γ', 'γ', # Double angle brackets + 'γ', 'γ', # Corner brackets + 'γ', 'γ', # White corner brackets +} + +# Translation artifacts patterns +TRANSLATION_ARTIFACTS = { + 'machine_translation': re.compile(r'(MTL 
note|TN:|Translator:|T/N:|TL note:|Translator\'s note:)', re.IGNORECASE), + 'encoding_issues': re.compile(r'[οΏ½β‘β]{2,}'), + 'repeated_watermarks': re.compile(r'(\[[\w\s]+\.(?:com|net|org)\])\s*\1{2,}', re.IGNORECASE), + 'chapter_continuation': re.compile(r'(to be continued|continued from|continuation of|cont\.)', re.IGNORECASE), + 'split_indicators': re.compile(r'(part \d+|section \d+|\(\d+/\d+\))', re.IGNORECASE), + 'api_response_unavailable': re.compile(r'\[AI RESPONSE UNAVAILABLE\]|\[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED\]|\[IMAGE TRANSLATION FAILED\]', re.IGNORECASE), + + 'glossary_leakage_csv': re.compile( + r'(?:type|raw_name|translated_name|gender|description)\s*,\s*(?:type|raw_name|translated_name|gender|description)', + re.IGNORECASE + ), + 'glossary_leakage_json': re.compile( + r'"(?:type|raw_name|translated_name|gender|description)"\s*:\s*"[^"]+"\s*,?\s*"(?:type|raw_name|translated_name|gender|description)"', + re.IGNORECASE + ) +} +# Cache configuration - will be updated by configure_qa_cache() +_cache_config = { + "enabled": True, + "sizes": { + "normalize_text": 10000, + "similarity_ratio": 20000, + "content_hashes": 5000, + "semantic_fingerprint": 2000, + "structural_signature": 2000, + "semantic_similarity": 5000, + "structural_similarity": 5000, + "file_extraction": 200 + } +} + +def configure_qa_cache(config): + """Update cache configuration""" + global _cache_config + _cache_config.update(config) + # Clear existing caches after configuration + clear_qa_caches() + # Re-apply caches with new sizes + _apply_caches() + +def get_cache_size(func_name): + """Get configured cache size for a function""" + if not _cache_config.get("enabled", True): + return 0 # Disable cache + + size = _cache_config.get("sizes", {}).get(func_name, 1000) + return None if size == -1 else size + +# Define functions WITHOUT decorators first +def extract_semantic_fingerprint_impl(text): + """Extract semantic fingerprint and signature from text""" + # For cache efficiency with long texts + cache_text = text[:50000] if len(text) > 50000 else text + + # Extract features for semantic analysis + words = cache_text.lower().split() + + # Character names (words starting with capital letters, appearing multiple times) + potential_names = re.findall(r'\b[A-Z][a-z]+\b', cache_text) + name_freq = Counter(potential_names) + characters = [name for name, count in name_freq.items() + if count >= 3 and name not in COMMON_WORDS] + + # Dialogue analysis + dialogue_matches = re.findall(r'["\"\'""''γγγγ]([^"\"\'""''γγγγ]+)["\"\'""''γγγγ]', cache_text) + dialogue_count = len(dialogue_matches) + dialogue_density = dialogue_count / max(1, len(words)) if words else 0 + dialogue_lengths = [len(d) for d in dialogue_matches[:30]] # First 30 dialogue lengths + + # Character frequencies (sorted list) + character_frequencies = [count for _, count in name_freq.most_common()] + + # Speaker sequence extraction + speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered|spoke)', cache_text.lower()) + speaker_sequence = speaker_patterns[:50] # First 50 speakers + + # Paragraph structure (lengths of each paragraph) + paragraphs = [p for p in cache_text.split('\n\n') if p.strip()] + paragraph_structure = [len(p) for p in paragraphs[:50]] # First 50 paragraph lengths + + # Action words density + action_words = len(re.findall(r'\b(\w+ed|spoke|says?|asks?|replies?|shouts?|screams?|whispers?)\b', cache_text)) + action_density = action_words / max(1, len(words)) if words else 0 + + # Numbers in text + numbers = 
re.findall(r'\b\d+\b', cache_text) + + # Create fingerprint string + fingerprint = f"chars:{len(characters)}_dial:{dialogue_density:.2f}_act:{action_density:.2f}_nums:{len(numbers)}_words:{len(words)}" + + # Create signature dict + signature = { + 'characters': characters[:20], # Top 20 characters + 'dialogue_density': dialogue_density, + 'dialogue_count': dialogue_count, + 'dialogue_lengths': dialogue_lengths, + 'character_frequencies': character_frequencies, + 'speaker_sequence': speaker_sequence, + 'paragraph_structure': paragraph_structure, + 'total_words': len(words), + 'action_density': action_density, + 'numbers': numbers[:50], # First 50 numbers + 'text_length': len(cache_text) + } + + return fingerprint, signature + +def extract_structural_signature_impl(text): + """Extract structural patterns from text""" + # For cache efficiency with long texts + cache_text = text[:50000] if len(text) > 50000 else text + + lines = cache_text.split('\n') + + # Count different types of lines + para_count = len([l for l in lines if len(l.strip()) > 50]) + short_lines = len([l for l in lines if 0 < len(l.strip()) < 20]) + empty_lines = len([l for l in lines if not l.strip()]) + + # Dialogue patterns + dialogue_lines = len(re.findall(r'["\"\'""''γγγγ].*?["\"\'""''γγγγ]', cache_text)) + + # Create pattern string (first letter of each line type) + pattern = '' + for line in lines[:100]: # First 100 lines + if not line.strip(): + pattern += 'E' # Empty + elif len(line.strip()) < 20: + pattern += 'S' # Short + elif re.search(r'["\"\'""''γγγγ]', line): + pattern += 'D' # Dialogue + else: + pattern += 'P' # Paragraph + + # Calculate average paragraph length + paragraphs = [l for l in lines if len(l.strip()) > 50] + avg_para_length = sum(len(p) for p in paragraphs) / max(1, len(paragraphs)) if paragraphs else 0 + + # Dialogue ratio + dialogue_ratio = dialogue_lines / max(1, len(lines)) + + signature = { + 'pattern': pattern, + 'paragraph_count': para_count, + 'avg_paragraph_length': avg_para_length, + 'dialogue_ratio': dialogue_ratio, + 'short_lines': short_lines, + 'empty_lines': empty_lines + } + + return signature + +def extract_content_fingerprint_impl(text): + """Extract key sentences that can identify duplicate content""" + lines = [line.strip() for line in text.split('\n') + if len(line.strip()) > 50 and not is_dash_separator_line(line)] + + if len(lines) < 5: + return "" + + # Take first, middle, and last substantial sentences + fingerprint_lines = [] + if len(lines) >= 3: + fingerprint_lines = [lines[0], lines[len(lines)//2], lines[-1]] + else: + fingerprint_lines = lines[:3] + + return ' '.join(fingerprint_lines).lower() + +# Initialize cached versions +extract_semantic_fingerprint = None +extract_structural_signature = None +extract_content_fingerprint = None + +def _apply_caches(): + """Apply LRU cache to functions with current configuration""" + global extract_semantic_fingerprint, extract_structural_signature, extract_content_fingerprint + + # Apply caching with current sizes + extract_semantic_fingerprint = lru_cache(maxsize=get_cache_size("semantic_fingerprint") or 2000)(extract_semantic_fingerprint_impl) + extract_structural_signature = lru_cache(maxsize=get_cache_size("structural_signature") or 2000)(extract_structural_signature_impl) + extract_content_fingerprint = lru_cache(maxsize=get_cache_size("content_fingerprint") or 2000)(extract_content_fingerprint_impl) + +# Apply initial caches +_apply_caches() + +def clear_qa_caches(): + """Clear all QA scanner caches""" + # Clear directly 
cached functions + if hasattr(normalize_text, 'cache_clear'): + normalize_text.cache_clear() + + if hasattr(generate_content_hashes, 'cache_clear'): + generate_content_hashes.cache_clear() + + if hasattr(calculate_similarity_ratio, 'cache_clear'): + calculate_similarity_ratio.cache_clear() + + # Clear the actual cached implementations + if hasattr(_calculate_semantic_similarity_cached, 'cache_clear'): + _calculate_semantic_similarity_cached.cache_clear() + + if hasattr(_calculate_structural_similarity_cached, 'cache_clear'): + _calculate_structural_similarity_cached.cache_clear() + + if hasattr(calculate_semantic_fingerprint_similarity, 'cache_clear'): + calculate_semantic_fingerprint_similarity.cache_clear() + + if hasattr(extract_semantic_fingerprint, 'cache_clear'): + extract_semantic_fingerprint.cache_clear() + + if hasattr(extract_structural_signature, 'cache_clear'): + extract_structural_signature.cache_clear() + + if hasattr(extract_content_fingerprint, 'cache_clear'): + extract_content_fingerprint.cache_clear() + + if hasattr(_extract_text_from_html_cached, 'cache_clear'): + _extract_text_from_html_cached.cache_clear() + +def get_cache_info(): + """Get cache statistics for all cached functions""" + cache_info = {} + + # For functions that are directly cached + if hasattr(normalize_text, 'cache_info'): + cache_info['normalize_text'] = normalize_text.cache_info() + + if hasattr(generate_content_hashes, 'cache_info'): + cache_info['content_hashes'] = generate_content_hashes.cache_info() + + if hasattr(calculate_similarity_ratio, 'cache_info'): + cache_info['similarity_ratio'] = calculate_similarity_ratio.cache_info() + + # For wrapper functions, use the actual cached implementation + if hasattr(_calculate_semantic_similarity_cached, 'cache_info'): + cache_info['semantic_similarity'] = _calculate_semantic_similarity_cached.cache_info() + + if hasattr(_calculate_structural_similarity_cached, 'cache_info'): + cache_info['structural_similarity'] = _calculate_structural_similarity_cached.cache_info() + + if hasattr(calculate_semantic_fingerprint_similarity, 'cache_info'): + cache_info['semantic_fingerprint_similarity'] = calculate_semantic_fingerprint_similarity.cache_info() + + if hasattr(extract_semantic_fingerprint, 'cache_info'): + cache_info['semantic_fingerprint'] = extract_semantic_fingerprint.cache_info() + + if hasattr(extract_structural_signature, 'cache_info'): + cache_info['structural_signature'] = extract_structural_signature.cache_info() + + if hasattr(extract_content_fingerprint, 'cache_info'): + cache_info['content_fingerprint'] = extract_content_fingerprint.cache_info() + + if hasattr(_extract_text_from_html_cached, 'cache_info'): + cache_info['file_extraction'] = _extract_text_from_html_cached.cache_info() + + return cache_info + +# For very long texts, we'll use a hash as cache key +def _get_cache_key(text, max_length=10000): + """Generate a cache key for text, using hash for long texts""" + if len(text) > max_length: + return hashlib.md5(text.encode('utf-8')).hexdigest() + return text + +def extract_text_from_html(file_path): + """Extract text from HTML or TXT file + + Returns: + str OR tuple: + - For backwards compatibility: just the text (if not checking HTML structure) + - For new functionality: (text_content, has_html_tag) tuple + """ + # Get file modification time as part of cache key + try: + mtime = os.path.getmtime(file_path) + cache_key = f"{file_path}:{mtime}" + except OSError: + cache_key = file_path + + return _extract_text_from_html_cached(cache_key, 
file_path) + +def _extract_text_from_html_cached(cache_key, file_path): + """Cached implementation of extract_text_from_html""" + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + + # Check if it's a .txt file + if file_path.lower().endswith('.txt'): + # For .txt files, just return the content directly + return content + + # For HTML files, parse with BeautifulSoup + soup = BeautifulSoup(content, "html.parser") + text = soup.get_text(separator='\n', strip=True) + + # For backwards compatibility, we'll handle the HTML tag check separately + # in the scan function rather than always returning a tuple + return text + +# Configure cache size dynamically +_extract_text_from_html_cached = lru_cache(maxsize=get_cache_size("file_extraction") or 200)(_extract_text_from_html_cached) + +import re + +def check_html_structure(file_path): + """Check if an HTML file has proper HTML tags""" + if not file_path.lower().endswith(('.html', '.xhtml', '.htm')): + return True + + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + + html_tags = [ + '', '
20] + + if len(sentences) < min_repeats: + return False + + counter = Counter(sentences) + + for sent, count in counter.items(): + if count >= min_repeats and len(sent) > 50: + if not any(pattern in sent.lower() for pattern in ['said', 'asked', 'replied', 'thought']): + return True + return False + +def is_korean_separator_pattern(text, excluded_chars=None): + """Check if text is a Korean separator pattern like [γ ‘γ ‘γ ‘γ ‘γ ‘]""" + if excluded_chars is None: + excluded_chars = KOREAN_SEPARATOR_CHARS + + # Remove brackets and spaces + cleaned = text.strip().strip('[]').strip() + + if not cleaned: + return False + + # Check if all characters are separators or excluded characters + return all(c in excluded_chars or c.isspace() for c in cleaned) + +def detect_non_english_content(text, qa_settings=None): + """Detect ONLY non-Latin script characters (not romanized text), excluding Korean separators""" + if qa_settings is None: + qa_settings = {'foreign_char_threshold': 10, 'excluded_characters': ''} + + # Get threshold and excluded characters + threshold = qa_settings.get('foreign_char_threshold', 10) + excluded_chars = set() + if qa_settings.get('excluded_characters'): + excluded_chars = set(qa_settings['excluded_characters'].split()) + + # Combine with existing separator chars + all_excluded_chars = KOREAN_SEPARATOR_CHARS.copy() + all_excluded_chars.update(excluded_chars) + + issues = [] + filtered_text = filter_dash_lines(text) + + # Define non-Latin script ranges + non_latin_ranges = [ + (0xAC00, 0xD7AF, 'Korean'), (0x1100, 0x11FF, 'Korean'), + (0x3130, 0x318F, 'Korean'), (0xA960, 0xA97F, 'Korean'), + (0xD7B0, 0xD7FF, 'Korean'), (0x3040, 0x309F, 'Japanese'), + (0x30A0, 0x30FF, 'Japanese'), (0x31F0, 0x31FF, 'Japanese'), + (0xFF65, 0xFF9F, 'Japanese'), (0x4E00, 0x9FFF, 'Chinese'), + (0x3400, 0x4DBF, 'Chinese'), (0x20000, 0x2A6DF, 'Chinese'), + (0x2A700, 0x2B73F, 'Chinese'), (0x0590, 0x05FF, 'Hebrew'), + (0x0600, 0x06FF, 'Arabic'), (0x0700, 0x074F, 'Syriac'), + (0x0750, 0x077F, 'Arabic'), (0x0E00, 0x0E7F, 'Thai'), + (0x0400, 0x04FF, 'Cyrillic'), (0x0500, 0x052F, 'Cyrillic'), + ] + + script_chars = {} + total_non_latin = 0 + + # Split text into potential separator patterns and other content + separator_pattern = r'\[[γ ‘\sβββ\-οΌ»οΌ½γγγγγγοΏ½οΏ½οΏ½γγγ]+\]' + parts = re.split(f'({separator_pattern})', filtered_text) + + for part in parts: + # Skip if this part is a Korean separator pattern + if is_korean_separator_pattern(part, all_excluded_chars): + continue + + # Check characters in this part + for char in part: + # Skip characters in excluded set + if char in all_excluded_chars: + continue + + # Skip whitespace and common punctuation + if char.isspace() or char in '[](){}.,;:!?\'"-': + continue + + code_point = ord(char) + for start, end, script_name in non_latin_ranges: + if start <= code_point <= end: + total_non_latin += 1 + if script_name not in script_chars: + script_chars[script_name] = {'count': 0, 'examples': []} + script_chars[script_name]['count'] += 1 + if len(script_chars[script_name]['examples']) < 10: + script_chars[script_name]['examples'].append(char) + break + + # Check against threshold + if total_non_latin > threshold: + for script, data in script_chars.items(): + examples = ''.join(data['examples'][:5]) + count = data['count'] + issues.append(f"{script}_text_found_{count}_chars_[{examples}]") + + return len(issues) > 0, issues + +def detect_translation_artifacts(text): + """Detect common translation/OCR artifacts""" + artifacts_found = [] + + for artifact_type, pattern in 
TRANSLATION_ARTIFACTS.items(): + matches = pattern.findall(text) + if matches: + artifacts_found.append({ + 'type': artifact_type, + 'count': len(matches), + 'examples': list(set(matches))[:3] + }) + + return artifacts_found + +def detect_glossary_leakage(text, threshold=2): + """ + Detect if translated text contains raw glossary entries. + + Args: + text: The translated text to check + threshold: Minimum number of glossary-like patterns to flag as leakage + + Returns: + tuple: (has_leakage, details) + """ + import re + + issues_found = [] + + # Check for CSV-style glossary headers + csv_header_pattern = re.compile( + r'type\s*,\s*raw_name\s*,\s*translated_name\s*,\s*gender\s*,\s*description', + re.IGNORECASE + ) + if csv_header_pattern.search(text): + issues_found.append({ + 'type': 'csv_header', + 'severity': 'critical', + 'description': 'Found CSV glossary header in translation' + }) + + # Check for multiple structured entries + entry_patterns = [ + # JSON-like entries + (r'\{\s*"type"\s*:\s*"[^"]+"\s*,\s*"raw_name"\s*:\s*"[^"]+"\s*,', 'json_entry'), + # CSV-like entries with Korean/Chinese characters + (r'(?:character|term)\s*,\s*[κ°-ν£\u4e00-\u9fff]+\s*,\s*[A-Za-z\s]+\s*,', 'csv_entry'), + # Tab-separated entries + (r'(?:character|term)\t[κ°-ν£\u4e00-\u9fff]+\t[A-Za-z\s]+\t', 'tsv_entry'), + ] + + for pattern_str, pattern_type in entry_patterns: + pattern = re.compile(pattern_str, re.IGNORECASE) + matches = pattern.findall(text) + if len(matches) >= threshold: + issues_found.append({ + 'type': pattern_type, + 'severity': 'high', + 'count': len(matches), + 'examples': matches[:3], + 'description': f'Found {len(matches)} {pattern_type} glossary entries' + }) + + # Check for repeated glossary field names + field_names = ['type', 'raw_name', 'translated_name', 'gender', 'description'] + field_count = sum(1 for field in field_names if text.lower().count(field) >= 3) + if field_count >= 3: + issues_found.append({ + 'type': 'repeated_field_names', + 'severity': 'medium', + 'description': f'Found {field_count} repeated glossary field names' + }) + + # Check for specific character/term patterns + char_term_pattern = re.compile( + r'(?:^|\n)\s*(?:character|term)\s*[,:\t]\s*[^\n]+(?:Male|Female|A\s+historical|Former\s+mayor|Character\s+from)', + re.IGNORECASE | re.MULTILINE + ) + char_matches = char_term_pattern.findall(text) + if len(char_matches) >= 2: + issues_found.append({ + 'type': 'character_definitions', + 'severity': 'high', + 'count': len(char_matches), + 'examples': char_matches[:2], + 'description': f'Found {len(char_matches)} character/term definitions' + }) + + has_leakage = len(issues_found) > 0 + + return has_leakage, issues_found + +def extract_semantic_fingerprint(text): + """Extract semantic fingerprint and signature from text - CACHED VERSION""" + # For cache efficiency with long texts + cache_text = text[:50000] if len(text) > 50000 else text + + # Extract features for semantic analysis + words = cache_text.lower().split() + + # Character names (words starting with capital letters, appearing multiple times) + potential_names = re.findall(r'\b[A-Z][a-z]+\b', cache_text) + name_freq = Counter(potential_names) + characters = [name for name, count in name_freq.items() + if count >= 3 and name not in COMMON_WORDS] + + # Dialogue analysis + dialogue_matches = re.findall(r'["\"\'""''γγγγ]([^"\"\'""''γγγγ]+)["\"\'""''γγγγ]', cache_text) + dialogue_count = len(dialogue_matches) + dialogue_density = dialogue_count / max(1, len(words)) if words else 0 + dialogue_lengths = [len(d) for d 
in dialogue_matches[:30]] # First 30 dialogue lengths + + # Character frequencies (sorted list) + character_frequencies = [count for _, count in name_freq.most_common()] + + # Speaker sequence extraction + speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered|spoke)', cache_text.lower()) + speaker_sequence = speaker_patterns[:50] # First 50 speakers + + # Paragraph structure (lengths of each paragraph) + paragraphs = [p for p in cache_text.split('\n\n') if p.strip()] + paragraph_structure = [len(p) for p in paragraphs[:50]] # First 50 paragraph lengths + + # Action words density + action_words = len(re.findall(r'\b(\w+ed|spoke|says?|asks?|replies?|shouts?|screams?|whispers?)\b', cache_text)) + action_density = action_words / max(1, len(words)) if words else 0 + + # Numbers in text + numbers = re.findall(r'\b\d+\b', cache_text) + + # Create fingerprint string + fingerprint = f"chars:{len(characters)}_dial:{dialogue_density:.2f}_act:{action_density:.2f}_nums:{len(numbers)}_words:{len(words)}" + + # Create signature dict + signature = { + 'characters': characters[:20], # Top 20 characters + 'dialogue_density': dialogue_density, + 'dialogue_count': dialogue_count, + 'dialogue_lengths': dialogue_lengths, + 'character_frequencies': character_frequencies, + 'speaker_sequence': speaker_sequence, + 'paragraph_structure': paragraph_structure, + 'total_words': len(words), + 'action_density': action_density, + 'numbers': numbers[:50], # First 50 numbers + 'text_length': len(cache_text) + } + + return fingerprint, signature + +# Apply dynamic caching +extract_semantic_fingerprint = lru_cache(maxsize=get_cache_size("semantic_fingerprint") or 2000)(extract_semantic_fingerprint) + +def extract_structural_signature(text): + """Extract structural patterns from text - CACHED VERSION""" + # For cache efficiency with long texts + cache_text = text[:50000] if len(text) > 50000 else text + + lines = cache_text.split('\n') + + # Count different types of lines + para_count = len([l for l in lines if len(l.strip()) > 50]) + short_lines = len([l for l in lines if 0 < len(l.strip()) < 20]) + empty_lines = len([l for l in lines if not l.strip()]) + + # Dialogue patterns + dialogue_lines = len(re.findall(r'["\"\'""''γγγγ].*?["\"\'""''γγγγ]', cache_text)) + + # Create pattern string (first letter of each line type) + pattern = '' + for line in lines[:100]: # First 100 lines + if not line.strip(): + pattern += 'E' # Empty + elif len(line.strip()) < 20: + pattern += 'S' # Short + elif re.search(r'["\"\'""''γγγγ]', line): + pattern += 'D' # Dialogue + else: + pattern += 'P' # Paragraph + + # Calculate average paragraph length + paragraphs = [l for l in lines if len(l.strip()) > 50] + avg_para_length = sum(len(p) for p in paragraphs) / max(1, len(paragraphs)) if paragraphs else 0 + + # Dialogue ratio + dialogue_ratio = dialogue_lines / max(1, len(lines)) + + signature = { + 'pattern': pattern, + 'paragraph_count': para_count, + 'avg_paragraph_length': avg_para_length, + 'dialogue_ratio': dialogue_ratio, + 'short_lines': short_lines, + 'empty_lines': empty_lines + } + + return signature + +def extract_content_fingerprint(text): + """Extract key sentences that can identify duplicate content - CACHED VERSION""" + # For cache efficiency with very long texts, limit to first 100KB + cache_text = text[:100000] if len(text) > 100000 else text + + lines = [line.strip() for line in cache_text.split('\n') + if len(line.strip()) > 50 and not is_dash_separator_line(line)] + + if len(lines) < 5: + return "" + + 
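+    # Sampling only a handful of substantial sentences keeps this fingerprint cheap to
+    # compute while still changing whenever the chapter's actual content changes.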
# Take first, middle, and last substantial sentences + fingerprint_lines = [] + if len(lines) >= 3: + fingerprint_lines = [lines[0], lines[len(lines)//2], lines[-1]] + else: + fingerprint_lines = lines[:3] + + return ' '.join(fingerprint_lines).lower() + +# Configure cache size dynamically +extract_content_fingerprint = lru_cache(maxsize=get_cache_size("content_fingerprint"))(extract_content_fingerprint) + +def roman_to_int(s): + """Convert Roman numerals to integer""" + try: + values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} + result = 0 + for i in range(len(s)): + if i + 1 < len(s) and values[s[i]] < values[s[i + 1]]: + result -= values[s[i]] + else: + result += values[s[i]] + return result + except: + return None + +def extract_chapter_info(filename, text): + """Extract chapter number and title from filename and content - ENHANCED VERSION""" + chapter_num = None + chapter_title = "" + + # Enhanced filename patterns - try multiple approaches + filename_patterns = [ + # Original patterns + (r"response_(\d+)_(.+?)\.html", 1, 2), + (r"response_chapter(\d+)\.html", 1, None), + (r"chapter[\s_-]*(\d+)", 1, None), + + # New patterns to catch more cases + (r"response_(\d{3,4})_", 1, None), # Catches response_003_ + (r"response_chapter(\d{4})\.html", 1, None), # Catches response_chapter0002 + (r"(\d{3,4})[_\.]", 1, None), # General 3-4 digit pattern + (r"No(\d+)Chapter", 1, None), + (r"ch[\s_-]*(\d+)", 1, None), + (r"_(\d+)_", 1, None), + (r"第(\d+)[η« θ―ε]", 1, None), # Chinese chapter markers + (r"μ (\d+)[μ₯νν]", 1, None), # Korean chapter markers + ] + + # Try each pattern + for pattern, num_group, title_group in filename_patterns: + m = re.search(pattern, filename, re.IGNORECASE) + if m: + try: + # Extract chapter number, removing leading zeros + chapter_num = int(m.group(num_group).lstrip('0') or '0') + if title_group and len(m.groups()) >= title_group: + chapter_title = m.group(title_group) + break + except (ValueError, IndexError): + continue + + # If still no chapter number, try content-based extraction + if chapter_num is None and text: + content_patterns = [ + r'Chapter\s+(\d+)', + r'第\s*(\d+)\s*η« ', + r'μ \s*(\d+)\s*μ₯', + r'Chapter\s+([IVXLCDM]+)', # Roman numerals + r'\bCh\.?\s*(\d+)', + r'Episode\s+(\d+)', + r'Part\s+(\d+)', + ] + + for pattern in content_patterns: + m = re.search(pattern, text[:1000], re.IGNORECASE) + if m: + if m.group(1).isdigit(): + chapter_num = int(m.group(1)) + else: + # Try to convert Roman numerals + num = roman_to_int(m.group(1)) + if num is not None: + chapter_num = num + if chapter_num is not None: + break + + return chapter_num, chapter_title + +def normalize_chapter_numbers(results): + """Normalize chapter numbers to handle different formats""" + for result in results: + # If we have a chapter number, ensure it's normalized + if result.get('chapter_num') is not None: + # This helps match chapter 2 with 002, etc. 
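+            # extract_chapter_info() already strips leading zeros, so "response_002_x.html"
+            # and "chapter2.html" both carry chapter_num 2; the normalized value is stored
+            # under a separate key so the original chapter_num is left untouched.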
+ result['normalized_chapter_num'] = int(result['chapter_num']) + +def fuzzy_match_chapter_numbers(text1, text2, num1, num2): + """Check if chapter numbers might be the same despite OCR errors""" + if num1 == num2: + return True + + # Check if numbers are close (OCR might misread) + if abs(num1 - num2) <= 1: + # Look for chapter declarations in text + pattern = r'Chapter\s*(\d+|[IVXLCDM]+)' + matches1 = re.findall(pattern, text1[:500], re.IGNORECASE) + matches2 = re.findall(pattern, text2[:500], re.IGNORECASE) + + if matches1 and matches2: + # Try to normalize roman numerals + def roman_to_int(s): + try: + values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} + result = 0 + for i in range(len(s)): + if i + 1 < len(s) and values[s[i]] < values[s[i + 1]]: + result -= values[s[i]] + else: + result += values[s[i]] + return result + except: + return None + + for m1 in matches1: + for m2 in matches2: + if m1.isdigit() and m2.isdigit(): + if abs(int(m1) - int(m2)) <= 1: + return True + elif not m1.isdigit() and not m2.isdigit(): + r1 = roman_to_int(m1.upper()) + r2 = roman_to_int(m2.upper()) + if r1 and r2 and abs(r1 - r2) <= 1: + return True + + return False + +def detect_split_chapters(results): + """Detect chapters that might have been split into multiple files + Now with better detection to avoid false positives from intentional author formatting + """ + split_candidates = [] + + # Common scene break patterns that authors use intentionally + scene_break_patterns = [ + r'[\*\s]{3,}', # *** or * * * + r'[ββοΌβ\-]{3,}', # Various dashes/lines + r'[_]{3,}', # ___ + r'[~ο½]{3,}', # ~~~ + r'[=]{3,}', # === + r'[\#]{3,}', # ### + r'[\.]{3,}', # ... + r'(?:Chapter|Scene|Part)\s+Break', # Explicit break text + r'(?:Meanwhile|Later|Earlier)', # Time transition words + r'\d+\s*(?:hours?|days?|weeks?|months?|years?)\s+(?:later|earlier|ago)', # Time skips + ] + + for i, result in enumerate(results): + text = result.get('raw_text', '') + filename = result.get('filename', '') + + # Skip if empty + if not text.strip(): + continue + + # Check for continuation indicators from AI + artifacts = detect_translation_artifacts(text) + has_continuation = any(a['type'] in ['chapter_continuation', 'split_indicators'] + for a in artifacts) + + # Check file naming patterns that suggest systematic splits + is_systematic_split = False + split_patterns = [ + r'chunk[\-_]?\d+', # chunk1, chunk_2 + r'part[\-_]?\d+[\-_]?\d+', # part1_2 (part 1 of chapter 2) + r'response_\d+_\d+', # response_42_3 + r'_\d+of\d+', # _1of3 + r'_split\d+', # _split1 + r'_continuation', # _continuation + ] + for pattern in split_patterns: + if re.search(pattern, filename, re.IGNORECASE): + is_systematic_split = True + break + + # Check if file is unusually short + is_short = len(text) < 2000 + + # Check for scene break indicators at start or end + text_start = text[:500].strip() + text_end = text[-500:].strip() + + has_scene_break_start = False + has_scene_break_end = False + + for pattern in scene_break_patterns: + if re.search(pattern, text_start[:100], re.IGNORECASE): + has_scene_break_start = True + if re.search(pattern, text_end[-100:], re.IGNORECASE): + has_scene_break_end = True + + # Check if starts mid-sentence (but not after scene break) + starts_mid = False + if text.strip() and not has_scene_break_start: + first_line = text.strip().split('\n')[0].strip() + # Skip if line starts with dialogue quotes or chapter markers + if first_line and not re.match(r'^["γγ\(\[]', first_line): + # Check if starts with lowercase 
(excluding certain words that commonly start sections) + first_word = first_line.split()[0] if first_line.split() else '' + transition_words = ['meanwhile', 'however', 'suddenly', 'later', 'earlier', + 'elsewhere', 'afterward', 'afterwards', 'then'] + if first_word.lower() not in transition_words: + starts_mid = first_line[0].islower() + + # Check if ends mid-sentence (but not with scene break) + ends_mid = False + if text.strip() and not has_scene_break_end: + last_line = text.strip().split('\n')[-1].strip() + if last_line: + # Check last character, ignoring quotes + last_char = last_line.rstrip('γγ"\'').rstrip() + if last_char: + ends_mid = last_char[-1] not in '.!?γοΌοΌβ¦' + + # Determine if this is likely a real split vs intentional formatting + is_likely_real_split = False + + if is_systematic_split: + # File naming strongly suggests a split + is_likely_real_split = True + elif has_continuation: + # AI detected continuation markers + is_likely_real_split = True + elif is_short and starts_mid and ends_mid and not (has_scene_break_start or has_scene_break_end): + # Short, starts and ends mid-sentence, no scene breaks + is_likely_real_split = True + elif is_short and ends_mid and not has_scene_break_end: + # Might be a split if it's short and ends abruptly + # Check if it ends with incomplete dialogue or mid-word + if text.strip(): + # Check for incomplete quotes or mid-word breaks + if (text.count('"') % 2 != 0 or text.count('γ') != text.count('γ') or + re.search(r'[a-zA-Z]-$', text.strip())): # Ends with hyphen (mid-word) + is_likely_real_split = True + + if is_likely_real_split: + split_candidates.append({ + 'index': i, + 'filename': filename, + 'indicators': { + 'has_continuation': has_continuation, + 'is_systematic_split': is_systematic_split, + 'is_short': is_short, + 'starts_mid': starts_mid, + 'ends_mid': ends_mid, + 'has_scene_break_start': has_scene_break_start, + 'has_scene_break_end': has_scene_break_end + } + }) + + return split_candidates + +def create_minhash_index(results, config): + """Create LSH index for fast similarity lookups""" + if not MINHASH_AVAILABLE: + return None, None + + threshold = config.get_threshold('minhash_threshold') + lsh = MinHashLSH(threshold=threshold, num_perm=128) + minhashes = {} + + total = len(results) + for idx, result in enumerate(results): + if idx % 50 == 0 and idx > 0: + print(f" Building MinHash index: {idx}/{total} files processed...") + + text = result.get('normalized_text', '') + if not text: + continue + + # Create MinHash + m = MinHash(num_perm=128) + for word in text.split(): + m.update(word.encode('utf8')) + + minhashes[result['filename']] = m + lsh.insert(result['filename'], m) + + return lsh, minhashes + +def _normalize_text_cached(cache_key): + """Cached implementation of normalize_text""" + # This will be called with the actual text + return cache_key + +def normalize_text(text): + """Normalize text for comparison - CACHED VERSION""" + normalized = text.lower().strip() + + # Remove chapter indicators + patterns = [ + r'chapter\s*\d+\s*:?\s*', r'第\s*\d+\s*η« ', r'μ \s*\d+\s*μ₯', + r'chapter\s+[ivxlcdm]+\s*:?\s*', r'\bch\.?\s*\d+\s*:?\s*', + r'^\s*\d+\s*\.?\s*', r'response_\d+_.*?\.html', + r'\d{4}-\d{2}-\d{2}', r'\d{2}:\d{2}:\d{2}', r'<[^>]+>' + ] + + for pattern in patterns: + normalized = re.sub(pattern, '', normalized, flags=re.IGNORECASE | re.MULTILINE) + + # Normalize whitespace and punctuation + normalized = re.sub(r'\s+', ' ', normalized) + normalized = re.sub(r'[^\w\s]', '', normalized) + + return normalized + +# 
Configure cache size dynamically +normalize_text = lru_cache(maxsize=get_cache_size("normalize_text"))(normalize_text) + +@lru_cache(maxsize=5000) +def _generate_content_hashes_cached(text_hash): + """Cached helper for generate_content_hashes""" + # This is just a placeholder - actual implementation is in the main function + return text_hash + +@lru_cache(maxsize=5000) +def generate_content_hashes(text): + """Generate multiple hashes for better duplicate detection - CACHED VERSION""" + # For very long texts, use first 50KB for cache key + cache_key = _get_cache_key(text, 50000) + + normalized = normalize_text(text) + + # 1. Raw hash + raw_hash = hashlib.md5(text.encode('utf-8')).hexdigest() + + # 2. Normalized hash + normalized_hash = hashlib.md5(normalized.encode('utf-8')).hexdigest() + + # 3. Content fingerprint + fingerprint = extract_content_fingerprint(text) + fingerprint_hash = hashlib.md5(fingerprint.encode('utf-8')).hexdigest() if fingerprint else None + + # 4. Word frequency hash + words = re.findall(r'\w+', normalized.lower()) + word_freq = Counter(words) + significant_words = [(w, c) for w, c in word_freq.most_common(100) + if w not in COMMON_WORDS][:50] + word_sig = ' '.join([f"{w}:{c}" for w, c in significant_words]) + word_hash = hashlib.md5(word_sig.encode('utf-8')).hexdigest() if word_sig else None + + # 5. First chunk hash + first_chunk = normalized[:1000] if len(normalized) > 1000 else normalized + first_chunk_hash = hashlib.md5(first_chunk.encode('utf-8')).hexdigest() + + # 6. Semantic fingerprint hash - FIXED + semantic_result = extract_semantic_fingerprint(text) + if semantic_result and isinstance(semantic_result, tuple) and len(semantic_result) >= 2: + semantic_str = semantic_result[0] + semantic_hash = hashlib.md5(semantic_str.encode('utf-8')).hexdigest() + else: + # Fallback if function returns unexpected value + semantic_hash = hashlib.md5(text[:1000].encode('utf-8')).hexdigest() + + # 7. 
Structural signature hash + structural_sig = extract_structural_signature(text) + if structural_sig: + structural_str = json.dumps(structural_sig, sort_keys=True) + structural_hash = hashlib.md5(structural_str.encode('utf-8')).hexdigest() + else: + # Fallback + structural_hash = hashlib.md5(text[:500].encode('utf-8')).hexdigest() + + return { + 'raw': raw_hash, + 'normalized': normalized_hash, + 'fingerprint': fingerprint_hash, + 'word_freq': word_hash, + 'first_chunk': first_chunk_hash, + 'semantic': semantic_hash, + 'structural': structural_hash + } + +@lru_cache(maxsize=20000) +def _calculate_similarity_ratio_cached(text1_hash, text2_hash): + """Cached helper for similarity ratio""" + return (text1_hash, text2_hash) + +@lru_cache(maxsize=20000) +def calculate_similarity_ratio(text1, text2): + """Calculate similarity with optimizations for large texts - CACHED VERSION""" + # Ensure consistent ordering for cache + if text1 > text2: + text1, text2 = text2, text1 + + len_ratio = len(text1) / max(1, len(text2)) + if len_ratio < 0.7 or len_ratio > 1.3: + return 0.0 + + if len(text1) > 10000: + sample_size = 3000 + samples1 = [ + text1[:sample_size], + text1[len(text1)//2 - sample_size//2:len(text1)//2 + sample_size//2], + text1[-sample_size:] + ] + samples2 = [ + text2[:sample_size], + text2[len(text2)//2 - sample_size//2:len(text2)//2 + sample_size//2], + text2[-sample_size:] + ] + similarities = [SequenceMatcher(None, s1, s2).ratio() for s1, s2 in zip(samples1, samples2)] + return sum(similarities) / len(similarities) + else: + return SequenceMatcher(None, text1, text2).ratio() + +# Configure cache size dynamically +calculate_similarity_ratio = lru_cache(maxsize=get_cache_size("similarity_ratio"))(calculate_similarity_ratio) + +# This function should NOT be cached directly +def calculate_semantic_similarity(sig1, sig2): + """Calculate similarity between two semantic signatures + This wrapper handles dict inputs and calls the cached implementation + """ + # Convert dicts to JSON strings + if isinstance(sig1, dict): + sig1_json = json.dumps(sig1, sort_keys=True) + else: + sig1_json = sig1 + + if isinstance(sig2, dict): + sig2_json = json.dumps(sig2, sort_keys=True) + else: + sig2_json = sig2 + + # Call the cached implementation with JSON strings + return _calculate_semantic_similarity_cached(sig1_json, sig2_json) + +# This function IS cached because it only receives JSON strings +def _calculate_semantic_similarity_cached(sig1_json, sig2_json): + """Cached implementation that works with JSON strings""" + sig1 = json.loads(sig1_json) + sig2 = json.loads(sig2_json) + + # Character overlap + chars1 = set(sig1.get('characters', [])) + chars2 = set(sig2.get('characters', [])) + char_overlap = len(chars1 & chars2) / max(1, len(chars1 | chars2)) + + # Dialogue density similarity + dial_sim = 1 - abs(sig1.get('dialogue_density', 0) - sig2.get('dialogue_density', 0)) + + # Action density similarity + act_sim = 1 - abs(sig1.get('action_density', 0) - sig2.get('action_density', 0)) + + # Number overlap + nums1 = set(sig1.get('numbers', [])) + nums2 = set(sig2.get('numbers', [])) + num_overlap = len(nums1 & nums2) / max(1, len(nums1 | nums2)) if nums1 or nums2 else 1 + + # Length similarity + len_ratio = min(sig1.get('text_length', 1), sig2.get('text_length', 1)) / max(1, max(sig1.get('text_length', 1), sig2.get('text_length', 1))) + + # Weighted average + return (char_overlap * 0.4 + dial_sim * 0.2 + act_sim * 0.2 + num_overlap * 0.1 + len_ratio * 0.1) + +# Apply caching ONLY to the implementation 
function, NOT the wrapper +_calculate_semantic_similarity_cached = lru_cache(maxsize=get_cache_size("semantic_similarity") or 5000)(_calculate_semantic_similarity_cached) + +# Make sure calculate_semantic_similarity is NOT cached +# If there's any line like this, REMOVE IT: +# calculate_semantic_similarity = lru_cache(...)(calculate_semantic_similarity) + + +def calculate_semantic_fingerprint_similarity(text1, text2): + """Calculate similarity based on semantic structure rather than exact wording - CACHED VERSION""" + # For very long texts, truncate for cache efficiency + cache_text1 = text1[:100000] if len(text1) > 100000 else text1 + cache_text2 = text2[:100000] if len(text2) > 100000 else text2 + + fingerprint1, sig1 = extract_semantic_fingerprint(cache_text1) + fingerprint2, sig2 = extract_semantic_fingerprint(cache_text2) + + similarities = [] + + # Compare dialogue structure (very reliable indicator) + if sig1['dialogue_count'] > 0 and sig2['dialogue_count'] > 0: + dialogue_ratio = min(sig1['dialogue_count'], sig2['dialogue_count']) / max(sig1['dialogue_count'], sig2['dialogue_count']) + similarities.append(dialogue_ratio) + + # Compare dialogue length patterns + if sig1['dialogue_lengths'] and sig2['dialogue_lengths']: + len_similarity = SequenceMatcher(None, sig1['dialogue_lengths'][:30], sig2['dialogue_lengths'][:30]).ratio() + similarities.append(len_similarity) + + # Compare character lists (names should mostly match) + if sig1['characters'] and sig2['characters']: + char_set1 = set(sig1['characters']) + char_set2 = set(sig2['characters']) + char_overlap = len(char_set1 & char_set2) / max(len(char_set1), len(char_set2)) + similarities.append(char_overlap) + + # Compare character frequency patterns + freq_similarity = SequenceMatcher(None, sig1['character_frequencies'], sig2['character_frequencies']).ratio() + similarities.append(freq_similarity * 0.8) # Slightly less weight + + # Compare numbers (very reliable - numbers rarely change) + if sig1['numbers'] and sig2['numbers']: + num_set1 = set(sig1['numbers']) + num_set2 = set(sig2['numbers']) + num_overlap = len(num_set1 & num_set2) / max(len(num_set1), len(num_set2)) + similarities.append(num_overlap) + + # Compare speaker sequences + if len(sig1['speaker_sequence']) >= 5 and len(sig2['speaker_sequence']) >= 5: + seq_similarity = SequenceMatcher(None, sig1['speaker_sequence'], sig2['speaker_sequence']).ratio() + similarities.append(seq_similarity) + + # Compare paragraph structure + if len(sig1['paragraph_structure']) >= 10 and len(sig2['paragraph_structure']) >= 10: + # Allow for some variation in lengths (Β±20%) + para_similarities = [] + for i in range(min(len(sig1['paragraph_structure']), len(sig2['paragraph_structure']))): + len1 = sig1['paragraph_structure'][i] + len2 = sig2['paragraph_structure'][i] + if len1 > 0 and len2 > 0: + ratio = min(len1, len2) / max(len1, len2) + para_similarities.append(1.0 if ratio > 0.8 else ratio) + + if para_similarities: + similarities.append(sum(para_similarities) / len(para_similarities)) + + # Word count ratio (should be similar) + word_ratio = min(sig1['total_words'], sig2['total_words']) / max(sig1['total_words'], sig2['total_words']) + similarities.append(word_ratio * 0.5) # Less weight + + # Calculate weighted average + if similarities: + return sum(similarities) / len(similarities) + else: + return 0.0 + +# Configure cache size dynamically +calculate_semantic_fingerprint_similarity = 
lru_cache(maxsize=get_cache_size("semantic_fingerprint"))(calculate_semantic_fingerprint_similarity) + +# This function should NOT be cached directly - it's the wrapper +def calculate_structural_similarity(struct1, struct2): + """Calculate similarity between two structural signatures + This wrapper handles dict inputs and calls the cached implementation + """ + # Convert dicts to JSON strings + if isinstance(struct1, dict): + struct1_json = json.dumps(struct1, sort_keys=True) + else: + struct1_json = struct1 + + if isinstance(struct2, dict): + struct2_json = json.dumps(struct2, sort_keys=True) + else: + struct2_json = struct2 + + # Call the cached implementation with JSON strings + return _calculate_structural_similarity_cached(struct1_json, struct2_json) + +# This function IS cached because it only receives JSON strings +def _calculate_structural_similarity_cached(struct1_json, struct2_json): + """Cached implementation that works with JSON strings""" + # Convert JSON strings back to dictionaries + struct1 = json.loads(struct1_json) + struct2 = json.loads(struct2_json) + + # Pattern similarity + pattern_sim = SequenceMatcher(None, struct1.get('pattern', ''), struct2.get('pattern', '')).ratio() + + # Paragraph count similarity + para_ratio = min(struct1.get('paragraph_count', 1), struct2.get('paragraph_count', 1)) / \ + max(1, max(struct1.get('paragraph_count', 1), struct2.get('paragraph_count', 1))) + + # Average paragraph length similarity + len_ratio = min(struct1.get('avg_paragraph_length', 1), struct2.get('avg_paragraph_length', 1)) / \ + max(1, max(struct1.get('avg_paragraph_length', 1), struct2.get('avg_paragraph_length', 1))) + + # Dialogue ratio similarity + dial_sim = 1 - abs(struct1.get('dialogue_ratio', 0) - struct2.get('dialogue_ratio', 0)) + + # Weighted average + return (pattern_sim * 0.5 + para_ratio * 0.2 + len_ratio * 0.15 + dial_sim * 0.15) + +# Apply caching ONLY to the implementation function, NOT the wrapper +_calculate_structural_similarity_cached = lru_cache(maxsize=get_cache_size("structural_similarity") or 5000)(_calculate_structural_similarity_cached) + +# Note: cache configurations are already applied earlier in the file + +def extract_chapter_title(text): + """Extract chapter title from text""" + patterns = [ + r'Chapter\s+\d+\s*:\s*([^\n\r]+)', + r'Chapter\s+\d+\s+([^\n\r]+)', + r'第\s*\d+\s*η« \s*[:οΌ]?\s*([^\n\r]+)', + r'μ \s*\d+\s*μ₯\s*[:οΌ]?\s*([^\n\r]+)', + ] + + for pattern in patterns: + match = re.search(pattern, text[:500], re.IGNORECASE) + if match: + title = match.group(1).strip() + title = re.sub(r'\s+', ' ', title) + title = title.split('.')[0].split('The')[0].strip() + return title[:100] if len(title) > 100 else title + + return None + +def merge_duplicate_groups(duplicate_groups, filename1, filename2): + """Intelligently merge duplicate groups when new connections are found + + Note: When called from parallel processing, should be wrapped with a lock + """ + group1 = duplicate_groups.get(filename1) + group2 = duplicate_groups.get(filename2) + + if group1 is None and group2 is None: + # Create new group + new_group = max(duplicate_groups.values(), default=-1) + 1 + duplicate_groups[filename1] = new_group + duplicate_groups[filename2] = new_group + elif group1 is not None and group2 is None: + # Add to existing group + duplicate_groups[filename2] = group1 + elif group1 is None and group2 is not None: + # Add to existing group + duplicate_groups[filename1] = group2 + elif group1 != group2: + # Merge two groups + min_group = min(group1, group2) + 
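+        # Union-find style merge: every file currently tagged with the higher-numbered
+        # group is reassigned to the lower-numbered one in the loop below.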
max_group = max(group1, group2) + for filename, group in duplicate_groups.items(): + if group == max_group: + duplicate_groups[filename] = min_group + + +def process_enhance_duplicate_batch(args): + """Process a batch of enhanced duplicate detection - MUST BE AT MODULE LEVEL""" + batch_type, batch_data, worker_data = args + batch_results = [] + + # Import what we need + from difflib import SequenceMatcher + import hashlib + + # Local caches for this worker + similarity_cache = {} + preview_cache = {} + + if batch_type == 'chapter_comparison': + # Process chapter number group comparisons + comparisons = batch_data + text_data = worker_data['text_data'] + threshold = worker_data['similarity_threshold'] + + for idx1, idx2, file1, file2, chapter_num in comparisons: + # Get text data + data1 = text_data[idx1] + data2 = text_data[idx2] + + # Create cache key (handle None hashes) + if data1['hash'] is None or data2['hash'] is None: + continue # Skip if either file is empty + + cache_key = (min(data1['hash'], data2['hash']), max(data1['hash'], data2['hash'])) + + if cache_key in similarity_cache: + similarity = similarity_cache[cache_key] + else: + # Check if hashes are identical + if data1['hash'] == data2['hash']: + similarity = 1.0 + else: + # Calculate similarity + similarity = calculate_similarity_ratio(data1['text'], data2['text']) + + similarity_cache[cache_key] = similarity + + if similarity >= threshold: + batch_results.append({ + 'type': 'chapter_duplicate', + 'file1': file1, + 'file2': file2, + 'chapter': chapter_num, + 'similarity': similarity, + 'preview1': data1['text'][:100], + 'preview2': data2['text'][:100] + }) + + elif batch_type == 'preview_comparison': + # Process preview-based comparisons + comparisons = batch_data + text_data = worker_data['text_data'] + preview_data = worker_data['preview_data'] + threshold = worker_data['similarity_threshold'] + preview_threshold = worker_data['preview_threshold'] + + for idx1, idx2, file1, file2 in comparisons: + # First check preview similarity + preview1 = preview_data[idx1] + preview2 = preview_data[idx2] + + # Normalize previews (first 50 words) + norm_preview1 = ' '.join(preview1['text'].split()[:50]) + norm_preview2 = ' '.join(preview2['text'].split()[:50]) + + # Check preview similarity (handle None hashes) + if preview1['hash'] is None or preview2['hash'] is None: + continue # Skip if either preview is empty + + preview_cache_key = (min(preview1['hash'], preview2['hash']), + max(preview1['hash'], preview2['hash'])) + + if preview_cache_key in preview_cache: + preview_sim = preview_cache[preview_cache_key] + else: + preview_sim = calculate_similarity_ratio(norm_preview1[:500], norm_preview2[:500]) + preview_cache[preview_cache_key] = preview_sim + + # If previews are similar enough, check full text + if preview_sim >= preview_threshold: + # Get full text data + data1 = text_data[idx1] + data2 = text_data[idx2] + + # Check full text similarity (handle None hashes) + if data1['hash'] is None or data2['hash'] is None: + continue # Skip if either file is empty + + cache_key = (min(data1['hash'], data2['hash']), max(data1['hash'], data2['hash'])) + + if cache_key in similarity_cache: + similarity = similarity_cache[cache_key] + else: + if data1['hash'] == data2['hash']: + similarity = 1.0 + else: + similarity = calculate_similarity_ratio(data1['text'], data2['text']) + + similarity_cache[cache_key] = similarity + + if similarity >= threshold: + batch_results.append({ + 'type': 'misnamed_duplicate', + 'file1': file1, + 'file2': file2, + 
'chapter': f"misnamed_{data1.get('chapter_num', '?')}_vs_{data2.get('chapter_num', '?')}", + 'similarity': similarity, + 'preview_similarity': preview_sim + }) + + return batch_results + + +def enhance_duplicate_detection(results, duplicate_groups, duplicate_confidence, config, log, should_stop=None): + """Additional duplicate detection - PROCESSPOOLEXECUTOR VERSION""" + + log("π Enhanced duplicate detection (different naming formats)...") + log("β‘ PROCESSPOOLEXECUTOR ENABLED - MAXIMUM PERFORMANCE!") + + # Determine number of workers + cpu_count = multiprocessing.cpu_count() + max_workers_config = 0 + + try: + config_path = os.path.join(os.path.dirname(__file__), 'config.json') + if os.path.exists(config_path): + with open(config_path, 'r', encoding='utf-8') as f: + full_config = json.load(f) + # Check multiple possible config locations + qa_config = full_config.get('qa_scanner_config', {}) + ai_hunter_config = full_config.get('ai_hunter_config', {}) + + # Priority: qa_scanner_config > ai_hunter_config + max_workers_config = qa_config.get('max_workers', + ai_hunter_config.get('ai_hunter_max_workers', 1)) + except: + max_workers_config = 0 + + if max_workers_config > 0: + max_workers = min(max_workers_config, cpu_count) + log(f" π₯οΈ Using {max_workers} parallel processes (configured limit)") + else: + max_workers = cpu_count + log(f" π Using ALL {max_workers} CPU cores for enhanced detection") + if cpu_count > 8: + log(f" π‘ Tip: You can limit CPU cores in QA scanner settings") + + # Pre-compute all data + log(" π Pre-computing text and preview data...") + + text_data = {} + preview_data = {} + + for i, result in enumerate(results): + # Text data (first 5000 chars) + text = result.get('raw_text', '')[:5000] + text_data[i] = { + 'text': text, + 'hash': hashlib.md5(text.encode()).hexdigest() if text else None, + 'length': len(text), + 'chapter_num': result.get('chapter_num') + } + + # Preview data (first 1000 chars) + preview = result.get('raw_text', '')[:1000].strip() + preview_data[i] = { + 'text': preview, + 'hash': hashlib.md5(preview.encode()).hexdigest() if preview else None + } + + # First, normalize all chapter numbers + normalize_chapter_numbers(results) + + # PART 1: Group by normalized chapter number + log(" π Checking files with same chapter numbers...") + + chapter_groups = {} + for i, result in enumerate(results): + if result.get('normalized_chapter_num') is not None: + num = result['normalized_chapter_num'] + if num not in chapter_groups: + chapter_groups[num] = [] + chapter_groups[num].append((i, result)) + + # Create comparison tasks for chapter groups + chapter_comparisons = [] + for chapter_num, group in chapter_groups.items(): + if len(group) > 1: + log(f" ββ Found {len(group)} files for chapter {chapter_num}") + + # Create all pair comparisons for this group + for i in range(len(group)): + for j in range(i + 1, len(group)): + idx1, result1 = group[i] + idx2, result2 = group[j] + chapter_comparisons.append(( + idx1, idx2, + result1['filename'], result2['filename'], + chapter_num + )) + + # Process chapter comparisons in batches + duplicates_found = [] + + if chapter_comparisons: + log(f" π Processing {len(chapter_comparisons)} chapter comparisons...") + + # Prepare worker data + worker_data = { + 'text_data': text_data, + 'similarity_threshold': config.get_threshold('similarity') + } + + # Create batches + batch_size = max(100, len(chapter_comparisons) // max_workers) + batches = [] + + for i in range(0, len(chapter_comparisons), batch_size): + batch = 
chapter_comparisons[i:i + batch_size] + batches.append(('chapter_comparison', batch, worker_data)) + + # Process with ProcessPoolExecutor + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = [] + + for batch_args in batches: + if should_stop and should_stop(): + log("β Enhanced detection interrupted by user.") + executor.shutdown(wait=True) + return duplicates_found + + future = executor.submit(process_enhance_duplicate_batch, batch_args) + futures.append(future) + + # Collect results + for future in concurrent.futures.as_completed(futures): + batch_results = future.result() + + # Process results + for result in batch_results: + if result['type'] == 'chapter_duplicate': + # Update duplicate groups + with merge_lock: + merge_duplicate_groups(duplicate_groups, + result['file1'], + result['file2']) + pair = tuple(sorted([result['file1'], result['file2']])) + duplicate_confidence[pair] = max( + duplicate_confidence.get(pair, 0), + result['similarity'] + ) + + duplicates_found.append(result) + + log(f" β DUPLICATE: {result['file1']} β {result['file2']} " + f"({int(result['similarity']*100)}%)") + log(f" Preview 1: {result['preview1']}...") + log(f" Preview 2: {result['preview2']}...") + + # PART 2: Check for misnamed files + log("π Checking for misnamed chapters (content vs filename mismatch)...") + + # Create preview-based comparison tasks + preview_comparisons = [] + total_files = len(results) + + # We need to check all pairs, but we can filter some obvious non-matches + for i in range(total_files): + if i % 100 == 0 and i > 0: + log(f" π Creating preview comparisons: {i}/{total_files} files...") + + for j in range(i + 1, total_files): + # Skip if: + # 1. Already in same duplicate group + if (results[i]['filename'] in duplicate_groups and + results[j]['filename'] in duplicate_groups and + duplicate_groups[results[i]['filename']] == duplicate_groups[results[j]['filename']]): + continue + + # 2. Both have same chapter number (already checked above) + if (results[i].get('normalized_chapter_num') is not None and + results[j].get('normalized_chapter_num') is not None and + results[i]['normalized_chapter_num'] == results[j]['normalized_chapter_num']): + continue + + # 3. 
Text lengths are very different (handle None/empty texts) + len1 = text_data[i]['length'] + len2 = text_data[j]['length'] + if len1 == 0 or len2 == 0: + continue # Skip empty files + + len_ratio = min(len1, len2) / max(len1, len2) + if len_ratio < 0.7: # Skip if lengths differ by more than 30% + continue + + preview_comparisons.append((i, j, results[i]['filename'], results[j]['filename'])) + + if preview_comparisons: + log(f" π Processing {len(preview_comparisons)} preview comparisons...") + + # Prepare worker data + worker_data = { + 'text_data': text_data, + 'preview_data': preview_data, + 'similarity_threshold': config.get_threshold('similarity'), + 'preview_threshold': 0.9 # High threshold for preview matching + } + + # Create batches + batch_size = max(500, len(preview_comparisons) // (max_workers * 10)) + batches = [] + + for i in range(0, len(preview_comparisons), batch_size): + batch = preview_comparisons[i:i + batch_size] + batches.append(('preview_comparison', batch, worker_data)) + + # Process with ProcessPoolExecutor + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = [] + + for batch_args in batches: + if should_stop and should_stop(): + log("β Enhanced detection interrupted by user.") + executor.shutdown(wait=True) + return duplicates_found + + future = executor.submit(process_enhance_duplicate_batch, batch_args) + futures.append(future) + + # Collect results with progress + completed = 0 + for future in concurrent.futures.as_completed(futures): + completed += 1 + if completed % 10 == 0: + log(f" π Preview comparison progress: {completed}/{len(futures)} batches") + + batch_results = future.result() + + # Process results + for result in batch_results: + if result['type'] == 'misnamed_duplicate': + # Update duplicate groups + with merge_lock: + merge_duplicate_groups(duplicate_groups, + result['file1'], + result['file2']) + pair = tuple(sorted([result['file1'], result['file2']])) + duplicate_confidence[pair] = max( + duplicate_confidence.get(pair, 0), + result['similarity'] + ) + + duplicates_found.append(result) + + log(f" β Found misnamed duplicate: {result['file1']} β {result['file2']} " + f"({int(result['similarity']*100)}%)") + + log(f"β Enhanced detection complete! 
Found {len(duplicates_found)} duplicates") + + return duplicates_found + +def detect_duplicates(results, log, should_stop, config): + """Detect duplicates using multiple strategies with enhanced methods - PERFORMANCE OPTIMIZED""" + duplicate_groups = {} + near_duplicate_groups = {} + duplicate_confidence = defaultdict(float) + + total_files = len(results) + dup_start_time = time.time() # Track timing for progress estimates + # Initialize comparisons_done at the function level + comparisons_done = 0 + + # Create local cached functions for this detection run + @lru_cache(maxsize=10000) + def compare_texts_cached(text1_hash, text2_hash, max_length=2000): + """Cached text comparison""" + # Find texts by hash + text1, text2 = None, None + for result in results: + text = result.get('raw_text', '')[:max_length] + text_hash = hashlib.md5(text.encode()).hexdigest() + if text_hash == text1_hash: + text1 = text + if text_hash == text2_hash: + text2 = text + + if text1 and text2: + return calculate_similarity_ratio(text1, text2) + return 0.0 + + # Pre-compute text hashes for caching + text_hashes = {} + for idx, result in enumerate(results): + text = result.get('raw_text', '') + text_hashes[idx] = { + 'hash_2k': hashlib.md5(text[:2000].encode()).hexdigest() if len(text) >= 2000 else None, + 'hash_5k': hashlib.md5(text[:5000].encode()).hexdigest() if len(text) >= 5000 else None, + 'full_text': text + } + + # Extract additional signatures for all results + log("π Extracting semantic and structural signatures...") + for idx, result in enumerate(results): + if should_stop(): + log("β Signature extraction interrupted by user.") + return duplicate_groups, near_duplicate_groups, duplicate_confidence + + if idx % 10 == 0: + progress = int((idx / total_files) * 100) + log(f" π Progress: {idx}/{total_files} files ({progress}%)") + + text = result.get('raw_text', '') + _, semantic_sig = extract_semantic_fingerprint(text) + structural_sig = extract_structural_signature(text) + result['semantic_sig'] = semantic_sig + result['structural_sig'] = structural_sig + result['normalized_text'] = normalize_text(text) + + # Create MinHash index if available + lsh, minhashes = None, None + if MINHASH_AVAILABLE and len(results) > 50: # Use MinHash for larger datasets + log("π Building MinHash index for fast similarity detection...") + lsh, minhashes = create_minhash_index(results, config) + + # 1. 
Hash-based detection (exact and near-exact matches) + content_hashes = defaultdict(lambda: defaultdict(list)) + + for idx, result in enumerate(results): + hashes = result['hashes'] + file_info = { + 'filename': result['filename'], + 'idx': idx, + 'chapter_num': result['chapter_num'], + 'result': result + } + + for hash_type, hash_value in hashes.items(): + if hash_value: + content_hashes[hash_type][hash_value].append(file_info) + + # Multiple levels of duplicate detection + duplicate_detection_levels = [ + ("exact content", 'raw', 1.0), + ("normalized content", 'normalized', 0.95), + ("semantic fingerprint", 'semantic', 0.85), + ("structural pattern", 'structural', 0.80), + ("first 1000 characters", 'first_chunk', 0.90), + ("content fingerprints", 'fingerprint', 0.85), + ("word frequency patterns", 'word_freq', 0.75) + ] + + for level_name, hash_type, confidence in duplicate_detection_levels: + log(f"π Checking {level_name}...") + for hash_value, files in content_hashes[hash_type].items(): + if len(files) > 1: + for i in range(len(files)): + for j in range(i + 1, len(files)): + merge_duplicate_groups(duplicate_groups, + files[i]['filename'], + files[j]['filename']) + duplicate_confidence[(files[i]['filename'], files[j]['filename'])] = max( + duplicate_confidence[(files[i]['filename'], files[j]['filename'])], + confidence + ) + log(f" ββ Found {len(files)} files with identical {level_name}") + + # 2. Enhanced duplicate detection for different naming formats + log("π Checking for same chapters with different naming...") + enhance_duplicate_detection(results, duplicate_groups, duplicate_confidence, config, log, should_stop) + + # 3. MinHash-based detection (if available) + if lsh: + log("π Performing MinHash similarity detection...") + for result in results: + if result['filename'] in minhashes: + candidates = lsh.query(minhashes[result['filename']]) + for candidate in candidates: + if candidate != result['filename']: + # Calculate exact Jaccard similarity + jaccard = minhashes[result['filename']].jaccard(minhashes[candidate]) + if jaccard >= config.get_threshold('minhash_threshold'): + merge_duplicate_groups(duplicate_groups, result['filename'], candidate) + duplicate_confidence[(result['filename'], candidate)] = jaccard + + # 4. 
Semantic similarity check - OPTIMIZED + log("π Checking semantic similarity...") + semantic_threshold = config.get_threshold('semantic') + + # Use MinHash candidates for semantic checking if available + if lsh and config.mode != 'ai-hunter': + log("π Using MinHash optimization for faster semantic checking...") + checked_count = 0 + + # For non-AI Hunter modes, use MinHash to limit comparisons + for result in results: + if should_stop(): + log("β Semantic check interrupted by user.") + break + + checked_count += 1 + if checked_count % 10 == 0: + log(f" π MinHash semantic check: {checked_count}/{len(results)} files processed...") + + if result['filename'] in minhashes: + candidates = lsh.query(minhashes[result['filename']]) + for candidate_filename in candidates: + if candidate_filename == result['filename']: + continue + + # Find the candidate result + candidate_result = next((r for r in results if r['filename'] == candidate_filename), None) + if not candidate_result: + continue + + # Skip if already in same group + if (result['filename'] in duplicate_groups and + candidate_filename in duplicate_groups and + duplicate_groups[result['filename']] == duplicate_groups[candidate_filename]): + continue + + sem_sim = calculate_semantic_similarity(result['semantic_sig'], + candidate_result['semantic_sig']) + if sem_sim >= semantic_threshold: + struct_sim = calculate_structural_similarity(result['structural_sig'], + candidate_result['structural_sig']) + + if struct_sim >= config.get_threshold('structural'): + merge_duplicate_groups(duplicate_groups, + result['filename'], + candidate_filename) + confidence = (sem_sim + struct_sim) / 2 + duplicate_confidence[(result['filename'], candidate_filename)] = confidence + log(f" ββ Semantic match: {result['filename']} β {candidate_filename} " + f"(sem: {int(sem_sim*100)}%, struct: {int(struct_sim*100)}%)") + + # AI Hunter mode or fallback: check all pairs + # Skip AI Hunter in quick scan mode + if config.mode == 'quick-scan': + log(" β‘ Skipping AI Hunter checks for quick scan mode") + else: + # AI Hunter mode or fallback: check all pairs + if config.mode == 'ai-hunter' or not lsh: + if config.mode == 'ai-hunter': + log("π€ AI Hunter mode: Enhanced semantic and structural checking active") + log(" β οΈ This will check ALL file pairs - may take several minutes for large datasets") + + total_comparisons = (len(results) * (len(results) - 1)) // 2 + log(f" [DEBUG] Total comparisons to perform: {total_comparisons:,}") + + ai_start_time = time.time() # Use local timer for AI Hunter + + # Initialize last_progress HERE for AI Hunter mode + last_progress = 0 # ADD THIS LINE + + # Use parallel processing for AI Hunter + comparisons_done = parallel_ai_hunter_check(results, duplicate_groups, duplicate_confidence, + config, log, should_stop) + + # Log AI Hunter completion stats + ai_time = time.time() - ai_start_time + log(f" [DEBUG] AI Hunter took {ai_time:.2f} seconds") + if comparisons_done and comparisons_done > 0: + log(f" [DEBUG] Comparisons/second: {int(comparisons_done/max(ai_time, 1)):,}") + + # AI HUNTER IS DONE - DO NOT CONTINUE TO SEQUENTIAL CODE + + else: + # Keep the original sequential code for when there's no LSH and not in AI Hunter mode + log("β οΈ No MinHash index available - checking all pairs (slower)") + + total_comparisons = (len(results) * (len(results) - 1)) // 2 + comparisons_done = 0 + last_progress = 0 # This is already here for sequential mode + ai_start_time = time.time() # Use local timer + + # MOVE ALL THE SEQUENTIAL CODE HERE - INDENTED 
UNDER THIS ELSE BLOCK + + # Create cached AI Hunter comparison + @lru_cache(maxsize=10000) + def ai_hunter_check_cached(idx1, idx2): + """Cached AI Hunter check""" + sem_sim = calculate_semantic_similarity(results[idx1]['semantic_sig'], + results[idx2]['semantic_sig']) + struct_sim = calculate_structural_similarity(results[idx1]['structural_sig'], + results[idx2]['structural_sig']) + + # Quick text check + hash1 = text_hashes[idx1]['hash_2k'] + hash2 = text_hashes[idx2]['hash_2k'] + if hash1 and hash2: + if hash1 > hash2: + hash1, hash2 = hash2, hash1 + text_sim = compare_texts_cached(hash1, hash2, 2000) + else: + text_sim = 0.0 + + return sem_sim, struct_sim, text_sim + + # Check EVERY pair of files + for i in range(len(results)): + if should_stop(): + log("β Semantic check interrupted by user.") + break + + for j in range(i + 1, len(results)): + comparisons_done += 1 + + # Show progress every 5% + progress = int((comparisons_done / total_comparisons) * 100) + if progress >= last_progress + 5: + elapsed = time.time() - ai_start_time + if elapsed > 0 and comparisons_done > 0: + rate = comparisons_done / elapsed + remaining = (total_comparisons - comparisons_done) / rate + log(f" π AI Hunter progress: {comparisons_done}/{total_comparisons} ({progress}%) - ~{int(remaining)}s remaining") + else: + log(f" π AI Hunter progress: {comparisons_done}/{total_comparisons} ({progress}%)") + last_progress = progress + + # Skip if already in same group + if (results[i]['filename'] in duplicate_groups and + results[j]['filename'] in duplicate_groups and + duplicate_groups[results[i]['filename']] == duplicate_groups[results[j]['filename']]): + continue + + # Get cached comparison results + sem_sim, struct_sim, text_sim = ai_hunter_check_cached(i, j) + + # For AI Hunter, use a combination approach + if config.mode == 'ai-hunter': + # High semantic + high structural = likely same content + if sem_sim >= semantic_threshold and struct_sim >= config.get_threshold('structural'): + # If text similarity is low but semantic/structural is high, it's likely a retranslation + if text_sim < 0.6: # Different enough text + log(f" π― AI Hunter: Found potential retranslation") + log(f" Files: {results[i]['filename']} β {results[j]['filename']}") + log(f" Text similarity: {int(text_sim*100)}% (low)") + log(f" Semantic similarity: {int(sem_sim*100)}% (high)") + log(f" Structural similarity: {int(struct_sim*100)}% (high)") + + merge_duplicate_groups(duplicate_groups, + results[i]['filename'], + results[j]['filename']) + confidence = (sem_sim + struct_sim) / 2 + duplicate_confidence[(results[i]['filename'], results[j]['filename'])] = confidence + log(f" ββ π€ Flagged as AI retranslation variant (confidence: {int(confidence*100)}%)") + else: + # Normal semantic checking + if sem_sim >= semantic_threshold and struct_sim >= config.get_threshold('structural'): + merge_duplicate_groups(duplicate_groups, + results[i]['filename'], + results[j]['filename']) + confidence = (sem_sim + struct_sim) / 2 + duplicate_confidence[(results[i]['filename'], results[j]['filename'])] = confidence + log(f" ββ Semantic match: {results[i]['filename']} β {results[j]['filename']} " + f"(sem: {int(sem_sim*100)}%, struct: {int(struct_sim*100)}%)") + + # Clear local cache + ai_hunter_check_cached.cache_clear() + + # THIS CODE SHOULD BE OUTSIDE ALL THE IF/ELSE BLOCKS - IT RUNS AFTER DUPLICATE DETECTION + # 5. 
Deep similarity check (content-based) - Now uses cached function + if config.mode != 'quick-scan': + perform_deep_similarity_check(results, duplicate_groups, duplicate_confidence, + config.get_threshold('similarity'), log, should_stop) + else: + log(" β‘ Skipping deep similarity check for quick scan mode") + + # 6. Consecutive chapter check with fuzzy matching - SKIP IN QUICK SCAN + if config.mode != 'quick-scan': + check_consecutive_chapters(results, duplicate_groups, duplicate_confidence, config, log, should_stop) + + # 7. Split chapter detection + split_candidates = detect_split_chapters(results) + if split_candidates: + log(f"π Found {len(split_candidates)} potential split chapters") + check_split_chapters(split_candidates, results, duplicate_groups, duplicate_confidence, log, should_stop) + + # 8. Specific pattern detection + check_specific_patterns(results, duplicate_groups, duplicate_confidence, log, should_stop) + + # Clear local caches + compare_texts_cached.cache_clear() + + # Summary of findings + unique_groups = len(set(duplicate_groups.values())) if duplicate_groups else 0 + files_with_duplicates = len(duplicate_groups) + + if files_with_duplicates > 0: + log(f"\nπ Duplicate Detection Summary:") + log(f" Found {files_with_duplicates} files with duplicates") + log(f" Grouped into {unique_groups} duplicate groups") + else: + log(f"\nβ No duplicates found among {len(results)} files") + + return duplicate_groups, near_duplicate_groups, duplicate_confidence + +def process_deep_similarity_batch(args): + """Process a batch of deep similarity comparisons with enhanced error handling""" + try: + batch, data = args + batch_results = [] + + text_samples = data['text_samples'] + threshold = data['threshold'] + + # Import what we need inside the worker with error handling + try: + from difflib import SequenceMatcher + except ImportError as e: + return [{'error': f'Import error in worker: {e}'}] + + # Local cache for this worker process + similarity_cache = {} + semantic_cache = {} + + for i, j, filename_i, filename_j in batch: + try: + # Get text samples + sample_i = text_samples.get(i) + sample_j = text_samples.get(j) + + if not sample_i or not sample_j: + continue + + # Use hashes for similarity check with caching + hash1 = sample_i['hash_5k'] + hash2 = sample_j['hash_5k'] + + # Create cache key (ensure consistent ordering) + cache_key = (min(hash1, hash2), max(hash1, hash2)) + + # Check cache first + if cache_key in similarity_cache: + similarity = similarity_cache[cache_key] + else: + # Check if hashes are identical + if hash1 == hash2: + similarity = 1.0 + else: + # Calculate text similarity + text1 = sample_i['sample_5k'] + text2 = sample_j['sample_5k'] + similarity = calculate_similarity_ratio(text1, text2) + + # Cache the result + similarity_cache[cache_key] = similarity + + if similarity >= threshold: + batch_results.append({ + 'filename1': filename_i, + 'filename2': filename_j, + 'similarity': similarity, + 'is_variant': False, + 'semantic_sim': None + }) + # Check for translation variants if similarity is moderate + elif 0.5 <= similarity < threshold: + # Check semantic similarity with caching + hash1_10k = sample_i['hash_10k'] + hash2_10k = sample_j['hash_10k'] + + # Create semantic cache key + sem_cache_key = (min(hash1_10k, hash2_10k), max(hash1_10k, hash2_10k)) + + if sem_cache_key in semantic_cache: + semantic_sim = semantic_cache[sem_cache_key] + else: + if hash1_10k == hash2_10k: + semantic_sim = 1.0 + else: + text1_10k = sample_i['sample_10k'] + text2_10k = 
sample_j['sample_10k'] + semantic_sim = calculate_semantic_fingerprint_similarity(text1_10k, text2_10k) + + # Cache the result + semantic_cache[sem_cache_key] = semantic_sim + + if semantic_sim >= 0.75: # High semantic similarity threshold + combined_score = (similarity * 0.4 + semantic_sim * 0.6) + + if combined_score >= threshold: + batch_results.append({ + 'filename1': filename_i, + 'filename2': filename_j, + 'similarity': combined_score, + 'is_variant': True, + 'semantic_sim': semantic_sim, + 'base_sim': similarity + }) + + except Exception as e: + # Log individual comparison error but continue processing + import traceback + batch_results.append({ + 'error': f'Error comparing {filename_i} vs {filename_j}: {str(e)}\n{traceback.format_exc()[:500]}' + }) + continue + + return batch_results + + except Exception as e: + # Return error information for debugging + import traceback + return [{'error': f'{type(e).__name__}: {str(e)}\nTraceback:\n{traceback.format_exc()}'}] + + +def perform_deep_similarity_check(results, duplicate_groups, duplicate_confidence, + threshold, log, should_stop): + """Perform deep similarity analysis - PROCESSPOOLEXECUTOR VERSION with fallback""" + + log(f"π Deep content similarity analysis (threshold: {int(threshold*100)}%)...") + + # Pre-cache text samples for all results + text_samples = {} + for idx, result in enumerate(results): + text = result.get('raw_text', '') + if len(text) >= 500: + text_samples[idx] = { + 'sample_5k': text[:5000], + 'sample_10k': text[:10000], + 'hash_5k': hashlib.md5(text[:5000].encode()).hexdigest(), + 'hash_10k': hashlib.md5(text[:10000].encode()).hexdigest() + } + + # Determine number of workers + cpu_count = multiprocessing.cpu_count() + max_workers_config = 0 + + try: + config_path = os.path.join(os.path.dirname(__file__), 'config.json') + if os.path.exists(config_path): + with open(config_path, 'r', encoding='utf-8') as f: + full_config = json.load(f) + # Check multiple possible config locations + qa_config = full_config.get('qa_scanner_config', {}) + deep_check_config = full_config.get('deep_check_config', {}) + ai_hunter_config = full_config.get('ai_hunter_config', {}) + + # Priority: deep_check_config > qa_scanner_config > ai_hunter_config + max_workers_config = deep_check_config.get('max_workers', + qa_config.get('max_workers', + ai_hunter_config.get('ai_hunter_max_workers', 1))) + except: + max_workers_config = 0 + + # Determine if we should use parallel processing + use_parallel = True + parallel_error = None + + if max_workers_config == 1: + use_parallel = False + log(" π Using sequential processing (configured for 1 worker)") + elif max_workers_config > 0: + max_workers = min(max_workers_config, cpu_count) + else: + max_workers = cpu_count + + # Create comparison tasks with smart filtering + comparison_tasks = [] + checked_pairs = set() + + for i in range(len(results)): + for j in range(i + 1, len(results)): + # Skip if not in text_samples (too short) + if i not in text_samples or j not in text_samples: + continue + + pair = tuple(sorted([results[i]['filename'], results[j]['filename']])) + if pair in checked_pairs: + continue + checked_pairs.add(pair) + + # Skip if already in same group + if (results[i]['filename'] in duplicate_groups and + results[j]['filename'] in duplicate_groups and + duplicate_groups[results[i]['filename']] == duplicate_groups[results[j]['filename']]): + continue + + comparison_tasks.append((i, j, results[i]['filename'], results[j]['filename'])) + + total_comparisons = len(comparison_tasks) + log(f" π 
Created {total_comparisons:,} comparison tasks") + + if total_comparisons == 0: + log(" β No comparisons needed!") + return + + # Try parallel processing first + if use_parallel: + log("β‘ PROCESSPOOLEXECUTOR ENABLED - MAXIMUM PERFORMANCE!") + if max_workers_config > 0: + log(f" π₯οΈ Using {max_workers} parallel processes (configured limit)") + else: + log(f" π Using ALL {max_workers} CPU cores - MAXIMUM PERFORMANCE!") + if cpu_count > 8: + log(f" π‘ Tip: You can limit CPU cores in QA scanner settings") + + # Progress tracking + comparisons_done = 0 + last_progress = 0 + start_time = time.time() + found_duplicates = [] + + # Prepare data for workers + worker_data = { + 'text_samples': text_samples, + 'threshold': threshold + } + + # Optimal batch size for ProcessPoolExecutor + optimal_batch_size = max(1000, total_comparisons // (max_workers * 5)) + optimal_batch_size = min(optimal_batch_size, 10000) + + batches = [] + for i in range(0, len(comparison_tasks), optimal_batch_size): + batch = comparison_tasks[i:i + optimal_batch_size] + batches.append(batch) + + log(f" π¦ Split into {len(batches)} batches of ~{optimal_batch_size} comparisons each") + + # Prepare batch arguments + batch_args = [(batch, worker_data) for batch in batches] + + try: + # Process with ProcessPoolExecutor + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + # Submit all batches + futures = [] + for args in batch_args: + if should_stop(): + log("β Deep similarity check interrupted by user.") + executor.shutdown(wait=True) + return + + future = executor.submit(process_deep_similarity_batch, args) + futures.append(future) + + # Process results as they complete + for completed_future in concurrent.futures.as_completed(futures): + if should_stop(): + log("β Deep similarity check interrupted by user.") + executor.shutdown(wait=True) + return + + try: + # NO TIMEOUT - let it run as long as needed + batch_results = completed_future.result() + + # Check for worker errors in results + if batch_results and isinstance(batch_results, list): + # Check if first result contains an error + if batch_results and isinstance(batch_results[0], dict) and 'error' in batch_results[0]: + error_msg = batch_results[0]['error'] + log(f" β οΈ Worker error detected: {error_msg}") + raise Exception(f"Worker error: {error_msg}") + + # Batch all updates + updates = [] + for result in batch_results: + if 'error' not in result: # Skip error entries + updates.append(( + result['filename1'], + result['filename2'], + result + )) + + # Apply all updates in one lock + if updates: + with merge_lock: + for file1, file2, result in updates: + pair = tuple(sorted([file1, file2])) + + merge_duplicate_groups(duplicate_groups, file1, file2) + duplicate_confidence[pair] = max( + duplicate_confidence.get(pair, 0), + result['similarity'] + ) + + # Store messages for logging + if result.get('is_variant', False): + msg = (f" ββ Translation variant detected: {file1} β {file2} " + f"(base: {int(result.get('base_sim', 0)*100)}%, " + f"semantic: {int(result['semantic_sim']*100)}%, " + f"combined: {int(result['similarity']*100)}%)") + else: + msg = (f" ββ Content similarity: {file1} β {file2} " + f"({int(result['similarity']*100)}%)") + + found_duplicates.append(msg) + + # Update progress + comparisons_done += optimal_batch_size + if comparisons_done > total_comparisons: + comparisons_done = total_comparisons + + progress = int((comparisons_done / total_comparisons) * 100) + + # Update every 10% for less overhead + if progress >= last_progress 
+ 10 or progress == 100: + elapsed = time.time() - start_time + rate = comparisons_done / elapsed if elapsed > 0 else 0 + remaining = (total_comparisons - comparisons_done) / rate if rate > 0 else 0 + + log(f" π Deep check progress: {comparisons_done:,}/{total_comparisons:,} " + f"({progress}%) - ~{int(remaining)}s remaining - " + f"Speed: {int(rate):,} comparisons/sec") + + # Log some found duplicates + for dup_msg in found_duplicates[:5]: + log(dup_msg) + found_duplicates = found_duplicates[5:] + + last_progress = progress + + except Exception as e: + log(f" β οΈ Error processing batch: {type(e).__name__}: {str(e)[:200]}") + import traceback + log(f" Debug trace: {traceback.format_exc()[:500]}") + parallel_error = f"{type(e).__name__}: {str(e)[:100]}" + use_parallel = False + executor.shutdown(wait=False) + break + + # If we completed successfully + if use_parallel: + # Final summary + elapsed = time.time() - start_time + log(f"β Deep similarity check complete! Processed {total_comparisons:,} comparisons in {elapsed:.1f}s") + log(f" β‘ Speed: {int(total_comparisons/elapsed):,} comparisons/sec") + log(f" π ProcessPoolExecutor: ENABLED") + + # Log remaining duplicates + for dup_msg in found_duplicates[-10:]: + log(dup_msg) + return # Success - exit function + + except Exception as e: + log(f" β οΈ Parallel processing failed: {type(e).__name__}: {str(e)[:200]}") + parallel_error = f"{type(e).__name__}: {str(e)[:100]}" + use_parallel = False + + # Fallback to sequential processing + if not use_parallel: + log(f"\n π FALLBACK: Using sequential processing") + if parallel_error: + log(f" Reason: {parallel_error}") + log(f" This will be slower but more reliable") + + # Reset progress tracking for sequential mode + comparisons_done = 0 + last_progress = 0 + start_time = time.time() + found_duplicates = [] + + # Import what we need for sequential processing + from difflib import SequenceMatcher + + for idx, task in enumerate(comparison_tasks): + if should_stop(): + log("β Deep similarity check interrupted by user.") + return + + i, j, filename_i, filename_j = task + comparisons_done += 1 + + # Show progress every 5% or every 100 comparisons (whichever is less frequent) + progress = int((comparisons_done / total_comparisons) * 100) + if (comparisons_done % max(100, total_comparisons // 20) == 0 or + comparisons_done == total_comparisons): + if progress >= last_progress + 5 or progress == 100: + elapsed = time.time() - start_time + rate = comparisons_done / elapsed if elapsed > 0 else 0 + remaining = (total_comparisons - comparisons_done) / rate if rate > 0 else 0 + + log(f" π Sequential progress: {comparisons_done:,}/{total_comparisons:,} " + f"({progress}%) - ~{int(remaining)}s remaining - " + f"Speed: {int(rate):,} comparisons/sec") + + # Log found duplicates + for dup_msg in found_duplicates[:3]: + log(dup_msg) + found_duplicates = found_duplicates[3:] + + last_progress = progress + + # Get text samples + sample_i = text_samples.get(i) + sample_j = text_samples.get(j) + + if not sample_i or not sample_j: + continue + + # Calculate similarity + if sample_i['hash_5k'] == sample_j['hash_5k']: + similarity = 1.0 + else: + text1 = sample_i['sample_5k'] + text2 = sample_j['sample_5k'] + similarity = calculate_similarity_ratio(text1, text2) + + if similarity >= threshold: + merge_duplicate_groups(duplicate_groups, filename_i, filename_j) + pair = tuple(sorted([filename_i, filename_j])) + duplicate_confidence[pair] = max( + duplicate_confidence.get(pair, 0), + similarity + ) + msg = f" ββ Content 
similarity: {filename_i} β {filename_j} ({int(similarity*100)}%)" + found_duplicates.append(msg) + + elif 0.5 <= similarity < threshold: + # Check semantic similarity for translation variants + text1_10k = sample_i['sample_10k'] + text2_10k = sample_j['sample_10k'] + + if sample_i['hash_10k'] == sample_j['hash_10k']: + semantic_sim = 1.0 + else: + semantic_sim = calculate_semantic_fingerprint_similarity(text1_10k, text2_10k) + + if semantic_sim >= 0.75: + combined_score = (similarity * 0.4 + semantic_sim * 0.6) + + if combined_score >= threshold: + merge_duplicate_groups(duplicate_groups, filename_i, filename_j) + pair = tuple(sorted([filename_i, filename_j])) + duplicate_confidence[pair] = max( + duplicate_confidence.get(pair, 0), + combined_score + ) + msg = (f" ββ Translation variant detected: {filename_i} β {filename_j} " + f"(base: {int(similarity*100)}%, semantic: {int(semantic_sim*100)}%, " + f"combined: {int(combined_score*100)}%)") + found_duplicates.append(msg) + + # Final summary for sequential mode + elapsed = time.time() - start_time + log(f"β Deep similarity check complete! Processed {total_comparisons:,} comparisons in {elapsed:.1f}s") + if elapsed > 0: + log(f" Speed: {int(total_comparisons/elapsed):,} comparisons/sec") + log(f" Mode: Sequential (fallback)") + + # Log remaining duplicates + for dup_msg in found_duplicates[-10:]: + log(dup_msg) + +def check_consecutive_chapters(results, duplicate_groups, duplicate_confidence, config, log, should_stop=None): + """Check for consecutive chapters with same title using fuzzy matching""" + log("π Checking consecutive same-titled chapters...") + + # Check for stop early + if should_stop and should_stop(): + log("β Consecutive chapter check interrupted by user.") + return + + # Extract chapter titles + for result in results: + result['chapter_title'] = extract_chapter_title(result['raw_text']) + + # Sort by chapter number + chapter_sorted = [r for r in results if r['chapter_num'] is not None] + chapter_sorted.sort(key=lambda x: x['chapter_num']) + + consecutive_threshold = config.get_threshold('consecutive_chapters') + + for i in range(len(chapter_sorted) - 1): + if should_stop and should_stop(): + log("β Consecutive chapter check interrupted by user.") + return + + current = chapter_sorted[i] + + for j in range(i + 1, min(i + consecutive_threshold + 1, len(chapter_sorted))): + next_chapter = chapter_sorted[j] + + # Check if chapter numbers might be the same (fuzzy match) + if fuzzy_match_chapter_numbers(current['raw_text'], next_chapter['raw_text'], + current['chapter_num'], next_chapter['chapter_num']): + # Compare content + similarity = calculate_similarity_ratio(current['raw_text'], next_chapter['raw_text']) + if similarity >= config.get_threshold('similarity'): + merge_duplicate_groups(duplicate_groups, current['filename'], next_chapter['filename']) + pair = tuple(sorted([current['filename'], next_chapter['filename']])) + duplicate_confidence[pair] = similarity + log(f" ββ Fuzzy chapter match: {current['filename']} β {next_chapter['filename']} ({int(similarity*100)}%)") + continue + + # Check same title + if (current.get('chapter_title') and current['chapter_title'] == next_chapter.get('chapter_title') and + abs(current['chapter_num'] - next_chapter['chapter_num']) <= consecutive_threshold): + + # Compare content without chapter headers + text1 = re.sub(r'Chapter\s+\d+\s*:?\s*', '', current['raw_text'][:2000], flags=re.IGNORECASE) + text2 = re.sub(r'Chapter\s+\d+\s*:?\s*', '', next_chapter['raw_text'][:2000], 
flags=re.IGNORECASE) + + similarity = calculate_similarity_ratio(text1, text2) + + if similarity >= config.get_threshold('similarity') * 0.9: # Slightly lower threshold for same title + merge_duplicate_groups(duplicate_groups, current['filename'], next_chapter['filename']) + pair = tuple(sorted([current['filename'], next_chapter['filename']])) + duplicate_confidence[pair] = similarity + log(f" ββ Same-titled chapters {current['chapter_num']} & {next_chapter['chapter_num']} " + f"({int(similarity*100)}% similar)") + + +def check_split_chapters(split_candidates, results, duplicate_groups, duplicate_confidence, log, should_stop=None): + """Check if split chapters are parts of the same content + Enhanced to reduce false positives from intentional author formatting + """ + for i, candidate in enumerate(split_candidates): + if should_stop and should_stop(): + log("β Split chapter check interrupted by user.") + return + + idx = candidate['index'] + indicators = candidate['indicators'] + + # Check next few files + for j in range(1, 4): # Check up to 3 files ahead + if idx + j < len(results): + next_result = results[idx + j] + next_text = next_result.get('raw_text', '') + + # Skip if next file is empty + if not next_text.strip(): + continue + + # Extract chapter numbers if present + current_chapter_num = results[idx].get('chapter_num') + next_chapter_num = next_result.get('chapter_num') + + # Strong indicator: same chapter number + same_chapter_number = (current_chapter_num is not None and + next_chapter_num is not None and + current_chapter_num == next_chapter_num) + + # Check file naming pattern similarity + current_filename = results[idx]['filename'] + next_filename = next_result['filename'] + + # Look for systematic naming (e.g., file_1.html, file_2.html) + naming_pattern_match = False + if re.sub(r'\d+', 'X', current_filename) == re.sub(r'\d+', 'X', next_filename): + # Files have same pattern with different numbers + naming_pattern_match = True + + # Check if content flows naturally + should_check_flow = False + confidence_score = 0.0 + + if indicators['is_systematic_split'] or naming_pattern_match: + # Strong file naming evidence + should_check_flow = True + confidence_score = 0.85 + elif same_chapter_number: + # Same chapter number is strong evidence + should_check_flow = True + confidence_score = 0.9 + elif indicators['ends_mid']: + # Only check flow if current ends mid-sentence + next_text_stripped = next_text.strip() + if next_text_stripped: + # Check if next starts without capital (excluding common transition words) + first_line = next_text_stripped.split('\n')[0].strip() + if first_line and not re.match(r'^["γγ\(\[]', first_line): + first_word = first_line.split()[0] if first_line.split() else '' + transition_words = ['meanwhile', 'however', 'suddenly', 'later', + 'earlier', 'elsewhere', 'afterward', 'afterwards', 'then'] + if (first_word.lower() not in transition_words and + first_line[0].islower()): + should_check_flow = True + confidence_score = 0.75 + + if should_check_flow: + # Get text samples for flow checking + text1_end = results[idx].get('raw_text', '')[-500:] + text2_start = next_text[:500] + + # Remove any scene break markers for flow check + scene_breaks = [r'[\*\s]{3,}', r'[ββοΌβ\-]{3,}', r'[_]{3,}', + r'[~ο½]{3,}', r'[=]{3,}', r'[\#]{3,}'] + for pattern in scene_breaks: + text1_end = re.sub(pattern, '', text1_end) + text2_start = re.sub(pattern, '', text2_start) + + # Check if content flows + combined = text1_end.strip() + " " + text2_start.strip() + + # Count sentence 
endings in combined text + sentence_endings = len(re.findall(r'[.!?γοΌοΌ]', combined)) + + # Check for incomplete dialogue + incomplete_dialogue = (text1_end.count('"') + text2_start.count('"')) % 2 != 0 + incomplete_dialogue_jp = (text1_end.count('γ') + text2_start.count('γ') != + text1_end.count('γ') + text2_start.count('γ')) + + # Determine if this is a real split + is_real_split = False + + if sentence_endings < 2: # Very few sentence endings suggests continuous text + is_real_split = True + confidence_score = max(confidence_score, 0.85) + elif incomplete_dialogue or incomplete_dialogue_jp: + is_real_split = True + confidence_score = max(confidence_score, 0.8) + elif same_chapter_number or indicators['is_systematic_split']: + # With strong other evidence, be more lenient + is_real_split = True + + if is_real_split: + merge_duplicate_groups(duplicate_groups, current_filename, next_filename) + pair = tuple(sorted([current_filename, next_filename])) + duplicate_confidence[pair] = confidence_score + + reason = [] + if same_chapter_number: + reason.append(f"same chapter #{current_chapter_num}") + if indicators['is_systematic_split']: + reason.append("systematic file naming") + if naming_pattern_match: + reason.append("matching name pattern") + if sentence_endings < 2: + reason.append("continuous text flow") + if incomplete_dialogue or incomplete_dialogue_jp: + reason.append("incomplete dialogue") + + reason_str = ", ".join(reason) if reason else "content flow analysis" + log(f" ββ Split chapter detected ({reason_str}): {current_filename} β {next_filename} " + f"(confidence: {int(confidence_score*100)}%)") + +def check_specific_patterns(results, duplicate_groups, duplicate_confidence, log, should_stop=None): + """Check for specific known duplicate patterns""" + log("π Checking for known duplicate patterns...") + + if should_stop and should_stop(): + log("β Pattern check interrupted by user.") + return + + # Known patterns that indicate duplicates + patterns = { + 'chapel_scene': r"under the pretense of offering a prayer.*?visited the chapel.*?hiding while holding.*?breath.*?watching the scene", + 'battle_scene': r"sword.*?clash.*?sparks.*?flew.*?metal.*?rang", + 'magic_spell': r"mana.*?gathered.*?spell.*?formation.*?glowed", + } + + pattern_matches = defaultdict(list) + + for i, result in enumerate(results): + text_sample = result.get('preview', '') + result.get('raw_text', '')[:2000] + + for pattern_name, pattern in patterns.items(): + if re.search(pattern, text_sample, re.IGNORECASE | re.DOTALL): + pattern_matches[pattern_name].append(i) + + # Group files with same patterns + for pattern_name, indices in pattern_matches.items(): + if should_stop and should_stop(): + log("β Pattern check interrupted by user.") + return + + if len(indices) > 1: + log(f" ββ Found {len(indices)} files with '{pattern_name}' pattern") + + for i in range(len(indices)): + for j in range(i + 1, len(indices)): + idx1, idx2 = indices[i], indices[j] + + # Verify with content similarity + similarity = calculate_similarity_ratio( + results[idx1].get('raw_text', '')[:3000], + results[idx2].get('raw_text', '')[:3000] + ) + + if similarity > 0.7: # Lower threshold for known patterns + merge_duplicate_groups(duplicate_groups, + results[idx1]['filename'], + results[idx2]['filename']) + pair = tuple(sorted([results[idx1]['filename'], results[idx2]['filename']])) + duplicate_confidence[pair] = similarity + log(f" Pattern match confirmed: {results[idx1]['filename']} β {results[idx2]['filename']}") + +def 
generate_reports(results, folder_path, duplicate_confidence, log=print, qa_settings=None): + """Generate output reports with enhanced duplicate information based on settings""" + if qa_settings is None: + qa_settings = {'report_format': 'detailed', 'auto_save_report': True} + + report_format = qa_settings.get('report_format', 'detailed') + auto_save = qa_settings.get('auto_save_report', True) + + # Create output directory + output_dir = os.path.basename(folder_path.rstrip('/\\')) + "_Scan Report" + output_path = os.path.join(folder_path, output_dir) + os.makedirs(output_path, exist_ok=True) + + # Prepare confidence scores for report + for result in results: + result['duplicate_confidence'] = 0 + for pair, confidence in duplicate_confidence.items(): + if result['filename'] in pair: + result['duplicate_confidence'] = max(result['duplicate_confidence'], confidence) + + # Common function to save all reports + def save_all_reports(): + # Save JSON report + with open(os.path.join(output_path, "validation_results.json"), "w", encoding="utf-8") as jf: + json.dump(results, jf, indent=2, ensure_ascii=False) + + # Save CSV report + with open(os.path.join(output_path, "validation_results.csv"), "w", encoding="utf-8", newline="") as cf: + writer = csv.DictWriter(cf, fieldnames=["file_index", "filename", "score", "issues", "duplicate_confidence"]) + writer.writeheader() + for row in results: + writer.writerow({ + "file_index": row["file_index"], + "filename": row["filename"], + "score": row["score"], + "issues": "; ".join(row["issues"]), + "duplicate_confidence": f"{row.get('duplicate_confidence', 0):.2f}" + }) + + # Generate HTML report + generate_html_report(results, output_path, duplicate_confidence) + + # Generate duplicate groups summary + generate_duplicate_summary(results, output_path, duplicate_confidence) + + # Generate reports based on format setting + if report_format == 'summary': + # Summary format - only key statistics + log(f"\nπ QA Scan Summary:") + log(f" Total files scanned: {len(results)}") + + issue_count = sum(1 for r in results if r['issues']) + log(f" Files with issues: {issue_count}") + + # Count by issue type + issue_types = {} + for result in results: + for issue in result['issues']: + issue_type = issue.split('_')[0] + issue_types[issue_type] = issue_types.get(issue_type, 0) + 1 + + log(f"\n Issues by type:") + for issue_type, count in sorted(issue_types.items(), key=lambda x: x[1], reverse=True): + log(f" - {issue_type}: {count}") + + # Save minimal summary file if auto-save enabled + if auto_save: + summary_file = os.path.join(output_path, "scan_summary.txt") + with open(summary_file, 'w', encoding='utf-8') as f: + f.write(f"QA Scan Summary\n") + f.write(f"===============\n\n") + f.write(f"Total files scanned: {len(results)}\n") + f.write(f"Files with issues: {issue_count}\n\n") + f.write(f"Issues by type:\n") + for issue_type, count in sorted(issue_types.items(), key=lambda x: x[1], reverse=True): + f.write(f" - {issue_type}: {count}\n") + log(f"\nπ Summary saved to: {output_path}") + + elif report_format == 'verbose': + # Verbose format - include everything including raw text samples + if auto_save: + # Save detailed JSON with all data + verbose_results = [] + for result in results.copy(): + verbose_result = result.copy() + # Include first 1000 chars of raw text in verbose mode + if 'raw_text' in result: + verbose_result['text_sample'] = result['raw_text'][:1000] + verbose_results.append(verbose_result) + + with open(os.path.join(output_path, 
"validation_results_verbose.json"), "w", encoding="utf-8") as jf: + json.dump(verbose_results, jf, indent=2, ensure_ascii=False) + + # Generate detailed text report + with open(os.path.join(output_path, "detailed_report.txt"), "w", encoding="utf-8") as tf: + tf.write("DETAILED QA SCAN REPORT\n") + tf.write("=" * 80 + "\n\n") + + for result in results: + tf.write(f"File: {result['filename']}\n") + tf.write(f"Chapter: {result.get('chapter_num', 'Unknown')}\n") + tf.write(f"Issues: {len(result['issues'])}\n") + if result['issues']: + for issue in result['issues']: + tf.write(f" - {issue}\n") + tf.write(f"Duplicate Confidence: {result.get('duplicate_confidence', 0):.2f}\n") + tf.write(f"Preview: {result.get('preview', '')[:200]}...\n") + tf.write("-" * 80 + "\n\n") + + # All existing reports (JSON, CSV, HTML) + save_all_reports() + + else: # detailed (default) + # Current behavior - standard reports + if auto_save: + save_all_reports() + else: + log(f"\nβ Scan complete! Reports not saved (auto-save disabled)") + + log(f"\nβ Scan complete!") + if auto_save: + log(f"π Reports saved to: {output_path}") + +def generate_duplicate_summary(results, output_path, duplicate_confidence): + """Generate a summary of duplicate groups""" + # Collect duplicate groups + groups = defaultdict(list) + for result in results: + for issue in result.get('issues', []): + if issue.startswith('DUPLICATE:'): + # Extract group info + if 'part_of_' in issue: + group_id = issue.split('part_of_')[1].split('_')[0] + groups[f"group_{group_id}"].append(result['filename']) + elif 'exact_or_near_copy_of_' in issue: + other = issue.split('exact_or_near_copy_of_')[1] + groups[f"pair_{result['filename']}_{other}"].append(result['filename']) + groups[f"pair_{result['filename']}_{other}"].append(other) + + # Create summary + summary = { + 'total_files': len(results), + 'files_with_duplicates': sum(1 for r in results if any('DUPLICATE' in i for i in r.get('issues', []))), + 'duplicate_groups': len(groups), + 'groups': {} + } + + for group_name, files in groups.items(): + unique_files = list(set(files)) + confidences = [] + for i in range(len(unique_files)): + for j in range(i + 1, len(unique_files)): + pair = tuple(sorted([unique_files[i], unique_files[j]])) + if pair in duplicate_confidence: + confidences.append(duplicate_confidence[pair]) + + summary['groups'][group_name] = { + 'files': unique_files, + 'count': len(unique_files), + 'avg_confidence': sum(confidences) / len(confidences) if confidences else 0 + } + + with open(os.path.join(output_path, "duplicate_summary.json"), "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, ensure_ascii=False) + +def generate_html_report(results, output_path, duplicate_confidence): + """Generate enhanced HTML report with duplicate confidence scores""" + issue_counts = {} + for r in results: + for issue in r['issues']: + issue_type = issue.split(':')[0] if ':' in issue else issue.split('_')[0] + issue_counts[issue_type] = issue_counts.get(issue_type, 0) + 1 + + html = f""" +
+<html><body>
+<h1>QA Scan Report</h1>
+<p>Total Files Scanned: {len(results)}</p>
+<p>Files with Issues: {sum(1 for r in results if r['issues'])}</p>
+<p>Clean Files: {sum(1 for r in results if not r['issues'])}</p>
+"""
+
+    if issue_counts:
+        html += "<h2>Issues by Type</h2><ul>"
+        for issue_type, count in sorted(issue_counts.items(), key=lambda x: x[1], reverse=True):
+            html += f"<li>{issue_type}: {count}</li>"
+        html += "</ul>"
+
+    html += ("<table><tr><th>Index</th><th>Filename</th><th>Issues</th>"
+             "<th>Confidence</th><th>Preview</th></tr>")
+
+    for row in results:
+        # Build the display fields for this row (same columns as the CSV report)
+        link = f'<a href="{html_lib.escape(row["filename"])}">{html_lib.escape(row["filename"])}</a>'
+        issues_str = html_lib.escape("; ".join(row["issues"])) if row["issues"] else "None"
+        confidence_str = f"{row.get('duplicate_confidence', 0):.2f}"
+        preview_escaped = html_lib.escape(row.get("preview", "")[:300])
+        html += (f"<tr><td>{row['file_index']}</td><td>{link}</td><td>{issues_str}</td>"
+                 f"<td>{confidence_str}</td><td>{preview_escaped}</td></tr>")
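+    # ------------------------------------------------------------------
+    # Illustrative usage sketch (comment only): roughly how the detection
+    # and reporting helpers in this module fit together. The `results`
+    # list, the folder path, and the log callback are assumed to be
+    # prepared by the scanning entry point; the qa_settings keys shown
+    # are the ones read by generate_reports().
+    #
+    #   config = DuplicateDetectionConfig(mode='quick-scan')
+    #   dup_groups, near_dups, confidence = detect_duplicates(
+    #       results, log=print, should_stop=lambda: False, config=config)
+    #   generate_reports(results, folder_path, confidence, log=print,
+    #                    qa_settings={'report_format': 'summary',
+    #                                 'auto_save_report': True})
+    # ------------------------------------------------------------------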
contains block elements: {[el.name for el in block_elements[:3]]}")
+
+    # Check for list items outside of lists
+    all_li = soup_strict.find_all('li')
+    for li in all_li:
+        parent = li.parent
+        if parent and parent.name not in ['ul', 'ol']:
+            invalid_nesting.append(f"