aamanlamba Claude committed on
Commit
b304992
·
1 Parent(s): 06e7156

Add Apache Atlas export, graph interactivity, and polish

Browse files

Features:
- Apache Atlas exporter for open-source data governance
- Enhanced graph visualization with click-to-zoom
- PNG/SVG download buttons for lineage graphs
- Updated README with HF username and Space URL
- 5 export formats now supported

Tests: 13 passing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (5) hide show
  1. README.md +7 -6
  2. app.py +22 -6
  3. exporters/__init__.py +2 -0
  4. exporters/atlas.py +264 -0
  5. tests/test_app.py +15 -0
README.md CHANGED
@@ -23,7 +23,7 @@ tags:
23
 
24
  **AI-powered data lineage extraction and visualization for modern data platforms**
25
 
26
- [![HuggingFace Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/YOUR_SPACE)
27
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
28
  [![Gradio](https://img.shields.io/badge/Gradio-6.0.0-orange)](https://gradio.app)
29
 
@@ -264,14 +264,15 @@ python test_setup.py
264
  **Track**: Track 2 - MCP in Action (Productivity)
265
 
266
  **Team Members**:
267
- - [Your HuggingFace Username]
 
268
 
269
  ### Judging Criteria Alignment
270
 
271
  | Criteria | Implementation |
272
  |----------|----------------|
273
  | **UI/UX Design** | Clean, professional interface with tabs, accordions, and color-coded visualizations |
274
- | **Functionality** | Full MCP integration, multiple input formats, 4 export formats |
275
  | **Creativity** | Novel approach to data lineage visualization with AI-powered parsing |
276
  | **Documentation** | Comprehensive README, USER_GUIDE.md, inline comments |
277
  | **Real-world Impact** | Solves critical enterprise need for data governance and compliance |
@@ -288,11 +289,11 @@ python test_setup.py
288
 
289
  ## 🔜 Roadmap
290
 
291
- - [ ] Gradio 6 upgrade for enhanced UI components
292
  - [ ] Agentic chatbot for natural language queries
293
- - [ ] Apache Atlas export support
294
  - [ ] File upload functionality
295
- - [ ] Graph export as PNG/SVG
296
  - [ ] Batch processing API
297
  - [ ] Column-level lineage
298
 
 
23
 
24
  **AI-powered data lineage extraction and visualization for modern data platforms**
25
 
26
+ [![HuggingFace Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/aamanlamba/Lineage-graph-accelerator)
27
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
28
  [![Gradio](https://img.shields.io/badge/Gradio-6.0.0-orange)](https://gradio.app)
29
 
 
264
  **Track**: Track 2 - MCP in Action (Productivity)
265
 
266
  **Team Members**:
267
+
268
+ - [aamanlamba](https://huggingface.co/aamanlamba)
269
 
270
  ### Judging Criteria Alignment
271
 
272
  | Criteria | Implementation |
273
  |----------|----------------|
274
  | **UI/UX Design** | Clean, professional interface with tabs, accordions, and color-coded visualizations |
275
+ | **Functionality** | Full MCP integration, multiple input formats, 5 export formats |
276
  | **Creativity** | Novel approach to data lineage visualization with AI-powered parsing |
277
  | **Documentation** | Comprehensive README, USER_GUIDE.md, inline comments |
278
  | **Real-world Impact** | Solves critical enterprise need for data governance and compliance |
 
289
 
290
  ## 🔜 Roadmap
291
 
292
+ - [x] Gradio 6 upgrade for enhanced UI components
293
  - [ ] Agentic chatbot for natural language queries
294
+ - [x] Apache Atlas export support
295
  - [ ] File upload functionality
296
+ - [x] Graph export as PNG/SVG
297
  - [ ] Batch processing API
298
  - [ ] Column-level lineage
299
 
app.py CHANGED
@@ -16,7 +16,7 @@ from datetime import datetime
16
  try:
17
  from exporters import (
18
  LineageGraph, LineageNode, LineageEdge,
19
- OpenLineageExporter, CollibraExporter, PurviewExporter, AlationExporter
20
  )
21
  EXPORTERS_AVAILABLE = True
22
  except ImportError:
@@ -38,7 +38,7 @@ SAMPLE_FILES = {
38
  "bigquery": "sample_bigquery.sql"
39
  }
40
 
41
- EXPORT_FORMATS = ["OpenLineage", "Collibra", "Purview", "Alation"]
42
 
43
  # Preset MCP Servers on HuggingFace that can provide metadata
44
  MCP_PRESETS = {
@@ -209,14 +209,28 @@ def render_mermaid(viz_code: str) -> str:
209
  # Create the mermaid.ink URL for SVG rendering
210
  img_url = f"https://mermaid.ink/svg/{encoded}"
211
 
 
 
 
212
  # Also create a link to the live editor for users who want to modify
213
  editor_url = f"https://mermaid.live/edit#base64:{base64.b64encode(viz_code.encode('utf-8')).decode('utf-8')}"
214
 
215
  html = f'''
216
  <div style="background: white; padding: 20px; border-radius: 8px; min-height: 200px;">
217
- <img src="{img_url}" alt="Lineage Graph" style="max-width: 100%; height: auto;" />
218
- <div style="margin-top: 10px; font-size: 12px; color: #666;">
219
- <a href="{editor_url}" target="_blank" style="color: #7c3aed;">Open in Mermaid Live Editor</a>
 
 
 
 
 
 
 
 
 
 
 
220
  </div>
221
  </div>
222
  '''
@@ -647,6 +661,8 @@ def export_lineage(metadata_text: str, source_type: str, export_format: str) ->
647
  exporter = PurviewExporter(graph)
648
  elif export_format == "Alation":
649
  exporter = AlationExporter(graph)
 
 
650
  else:
651
  return "", f"Unknown export format: {export_format}"
652
 
@@ -779,7 +795,7 @@ with gr.Blocks(
779
  |---------|-------------|
780
  | **Extract Lineage** | Parse metadata from dbt manifests, Airflow DAGs, SQL DDL, BigQuery, and custom JSON |
781
  | **Visualize** | Generate interactive Mermaid diagrams with color-coded nodes and relationship labels |
782
- | **Export** | Export to enterprise data catalogs: OpenLineage, Collibra, Microsoft Purview, Alation |
783
  | **MCP Integration** | Connect to MCP servers for AI-powered metadata extraction |
784
 
785
  ### Quick Start
 
16
  try:
17
  from exporters import (
18
  LineageGraph, LineageNode, LineageEdge,
19
+ OpenLineageExporter, CollibraExporter, PurviewExporter, AlationExporter, AtlasExporter
20
  )
21
  EXPORTERS_AVAILABLE = True
22
  except ImportError:
 
38
  "bigquery": "sample_bigquery.sql"
39
  }
40
 
41
+ EXPORT_FORMATS = ["OpenLineage", "Collibra", "Purview", "Alation", "Atlas"]
42
 
43
  # Preset MCP Servers on HuggingFace that can provide metadata
44
  MCP_PRESETS = {
 
209
  # Create the mermaid.ink URL for SVG rendering
210
  img_url = f"https://mermaid.ink/svg/{encoded}"
211
 
212
+ # PNG version for download
213
+ png_url = f"https://mermaid.ink/img/{encoded}"
214
+
215
  # Also create a link to the live editor for users who want to modify
216
  editor_url = f"https://mermaid.live/edit#base64:{base64.b64encode(viz_code.encode('utf-8')).decode('utf-8')}"
217
 
218
  html = f'''
219
  <div style="background: white; padding: 20px; border-radius: 8px; min-height: 200px;">
220
+ <div style="overflow: auto; max-height: 500px; border: 1px solid #e0e0e0; border-radius: 4px; padding: 10px;">
221
+ <img id="lineage-graph" src="{img_url}" alt="Lineage Graph" style="max-width: 100%; height: auto; cursor: zoom-in;" onclick="this.style.maxWidth = this.style.maxWidth === 'none' ? '100%' : 'none'; this.style.cursor = this.style.cursor === 'zoom-in' ? 'zoom-out' : 'zoom-in';" />
222
+ </div>
223
+ <div style="margin-top: 12px; display: flex; gap: 16px; flex-wrap: wrap; align-items: center;">
224
+ <a href="{editor_url}" target="_blank" style="color: #7c3aed; text-decoration: none; font-size: 13px;">
225
+ Edit in Mermaid Live
226
+ </a>
227
+ <a href="{png_url}" download="lineage_graph.png" style="color: #2563eb; text-decoration: none; font-size: 13px;">
228
+ Download PNG
229
+ </a>
230
+ <a href="{img_url}" download="lineage_graph.svg" style="color: #059669; text-decoration: none; font-size: 13px;">
231
+ Download SVG
232
+ </a>
233
+ <span style="color: #888; font-size: 12px; margin-left: auto;">Click graph to zoom</span>
234
  </div>
235
  </div>
236
  '''
 
661
  exporter = PurviewExporter(graph)
662
  elif export_format == "Alation":
663
  exporter = AlationExporter(graph)
664
+ elif export_format == "Atlas":
665
+ exporter = AtlasExporter(graph)
666
  else:
667
  return "", f"Unknown export format: {export_format}"
668
 
 
795
  |---------|-------------|
796
  | **Extract Lineage** | Parse metadata from dbt manifests, Airflow DAGs, SQL DDL, BigQuery, and custom JSON |
797
  | **Visualize** | Generate interactive Mermaid diagrams with color-coded nodes and relationship labels |
798
+ | **Export** | Export to enterprise data catalogs: OpenLineage, Collibra, Purview, Alation, Atlas |
799
  | **MCP Integration** | Connect to MCP servers for AI-powered metadata extraction |
800
 
801
  ### Quick Start
exporters/__init__.py CHANGED
@@ -14,6 +14,7 @@ from .openlineage import OpenLineageExporter
14
  from .collibra import CollibraExporter
15
  from .purview import PurviewExporter
16
  from .alation import AlationExporter
 
17
 
18
  __all__ = [
19
  'LineageExporter',
@@ -24,4 +25,5 @@ __all__ = [
24
  'CollibraExporter',
25
  'PurviewExporter',
26
  'AlationExporter',
 
27
  ]
 
14
  from .collibra import CollibraExporter
15
  from .purview import PurviewExporter
16
  from .alation import AlationExporter
17
+ from .atlas import AtlasExporter
18
 
19
  __all__ = [
20
  'LineageExporter',
 
25
  'CollibraExporter',
26
  'PurviewExporter',
27
  'AlationExporter',
28
+ 'AtlasExporter',
29
  ]
exporters/atlas.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Apache Atlas Exporter - Export to Apache Atlas format.
3
+
4
+ Apache Atlas is an open-source metadata management and data governance framework.
5
+ https://atlas.apache.org/
6
+ """
7
+
8
+ from typing import Dict, Any, List
9
+ from datetime import datetime
10
+ import uuid
11
+ from .base import LineageExporter, LineageGraph, LineageNode, LineageEdge
12
+
13
+
14
+ class AtlasExporter(LineageExporter):
15
+ """Export lineage to Apache Atlas format."""
16
+
17
+ def __init__(self, graph: LineageGraph, cluster_name: str = "lineage_accelerator"):
18
+ super().__init__(graph)
19
+ self.cluster_name = cluster_name
20
+
21
+ @property
22
+ def format_name(self) -> str:
23
+ return "Apache Atlas"
24
+
25
+ @property
26
+ def file_extension(self) -> str:
27
+ return ".json"
28
+
29
+ def _node_type_to_atlas_type(self, node_type: str) -> str:
30
+ """Map internal node types to Atlas type names."""
31
+ type_mapping = {
32
+ "table": "hive_table",
33
+ "view": "hive_table",
34
+ "model": "hive_table",
35
+ "source": "hive_db",
36
+ "destination": "hive_table",
37
+ "column": "hive_column",
38
+ "database": "hive_db",
39
+ "schema": "hive_db",
40
+ "report": "Report",
41
+ "dimension": "hive_table",
42
+ "fact": "hive_table",
43
+ "feature_set": "hive_table",
44
+ "semantic_model": "DataSet",
45
+ "external_api": "api_endpoint",
46
+ "extract": "hive_table",
47
+ "task": "Process"
48
+ }
49
+ return type_mapping.get(node_type.lower(), "DataSet")
50
+
51
+ def _generate_guid(self, identifier: str) -> str:
52
+ """Generate a deterministic GUID for an entity."""
53
+ return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{self.cluster_name}.{identifier}"))
54
+
55
+ def _create_qualified_name(self, node: LineageNode) -> str:
56
+ """Create Atlas-style qualified name for a node."""
57
+ parts = []
58
+ if node.database:
59
+ parts.append(node.database)
60
+ if node.schema:
61
+ parts.append(node.schema)
62
+ parts.append(node.name)
63
+ parts.append(self.cluster_name)
64
+ return ".".join(parts) + f"@{self.cluster_name}"
65
+
66
+ def _create_entity(self, node: LineageNode) -> Dict[str, Any]:
67
+ """Create an Atlas entity from a node."""
68
+ qualified_name = self._create_qualified_name(node)
69
+ atlas_type = self._node_type_to_atlas_type(node.type)
70
+
71
+ entity = {
72
+ "typeName": atlas_type,
73
+ "guid": self._generate_guid(node.id),
74
+ "attributes": {
75
+ "qualifiedName": qualified_name,
76
+ "name": node.name,
77
+ "description": node.description or f"{node.type}: {node.name}",
78
+ "owner": node.owner or "lineage_accelerator",
79
+ "createTime": int(datetime.now().timestamp() * 1000),
80
+ "modifiedTime": int(datetime.now().timestamp() * 1000)
81
+ },
82
+ "status": "ACTIVE"
83
+ }
84
+
85
+ # Add type-specific attributes
86
+ if atlas_type == "hive_table":
87
+ entity["attributes"]["tableType"] = "MANAGED_TABLE"
88
+ if node.database:
89
+ entity["attributes"]["db"] = {
90
+ "typeName": "hive_db",
91
+ "uniqueAttributes": {
92
+ "qualifiedName": f"{node.database}@{self.cluster_name}"
93
+ }
94
+ }
95
+
96
+ # Add columns if present
97
+ if node.columns:
98
+ entity["attributes"]["columns"] = [
99
+ {
100
+ "typeName": "hive_column",
101
+ "uniqueAttributes": {
102
+ "qualifiedName": f"{qualified_name}.{col.get('name')}@{self.cluster_name}"
103
+ }
104
+ }
105
+ for col in node.columns
106
+ ]
107
+
108
+ # Add classification tags
109
+ if node.tags:
110
+ entity["classifications"] = [
111
+ {"typeName": tag} for tag in node.tags
112
+ ]
113
+
114
+ # Add custom attributes
115
+ if node.metadata:
116
+ entity["attributes"]["additionalProperties"] = node.metadata
117
+
118
+ if node.category:
119
+ entity["attributes"]["dataLayer"] = node.category
120
+
121
+ return entity
122
+
123
+ def _create_column_entities(self, node: LineageNode) -> List[Dict[str, Any]]:
124
+ """Create Atlas column entities from a node's columns."""
125
+ if not node.columns:
126
+ return []
127
+
128
+ table_qualified_name = self._create_qualified_name(node)
129
+ columns = []
130
+
131
+ for idx, col in enumerate(node.columns):
132
+ col_name = col.get("name", f"column_{idx}")
133
+ col_qualified_name = f"{table_qualified_name}.{col_name}"
134
+
135
+ column_entity = {
136
+ "typeName": "hive_column",
137
+ "guid": self._generate_guid(f"{node.id}.{col_name}"),
138
+ "attributes": {
139
+ "qualifiedName": col_qualified_name,
140
+ "name": col_name,
141
+ "type": col.get("type") or col.get("data_type", "string"),
142
+ "description": col.get("description", ""),
143
+ "position": idx,
144
+ "table": {
145
+ "typeName": self._node_type_to_atlas_type(node.type),
146
+ "uniqueAttributes": {
147
+ "qualifiedName": table_qualified_name
148
+ }
149
+ }
150
+ },
151
+ "status": "ACTIVE"
152
+ }
153
+
154
+ columns.append(column_entity)
155
+
156
+ return columns
157
+
158
+ def _create_process_entity(self, edge: LineageEdge) -> Dict[str, Any]:
159
+ """Create an Atlas process entity from an edge (for lineage)."""
160
+ source_node = self.graph.get_node(edge.source)
161
+ target_node = self.graph.get_node(edge.target)
162
+
163
+ process_name = edge.job_name or f"process_{edge.source}_to_{edge.target}"
164
+ process_qualified_name = f"{process_name}@{self.cluster_name}"
165
+
166
+ # Build inputs and outputs
167
+ inputs = []
168
+ outputs = []
169
+
170
+ if source_node:
171
+ inputs.append({
172
+ "typeName": self._node_type_to_atlas_type(source_node.type),
173
+ "uniqueAttributes": {
174
+ "qualifiedName": self._create_qualified_name(source_node)
175
+ }
176
+ })
177
+
178
+ if target_node:
179
+ outputs.append({
180
+ "typeName": self._node_type_to_atlas_type(target_node.type),
181
+ "uniqueAttributes": {
182
+ "qualifiedName": self._create_qualified_name(target_node)
183
+ }
184
+ })
185
+
186
+ process = {
187
+ "typeName": "Process",
188
+ "guid": self._generate_guid(f"process.{edge.source}.{edge.target}"),
189
+ "attributes": {
190
+ "qualifiedName": process_qualified_name,
191
+ "name": process_name,
192
+ "description": edge.transformation or f"Data flow: {edge.type}",
193
+ "inputs": inputs,
194
+ "outputs": outputs,
195
+ "operationType": edge.type.upper() if edge.type else "ETL"
196
+ },
197
+ "status": "ACTIVE"
198
+ }
199
+
200
+ if edge.job_id:
201
+ process["attributes"]["jobId"] = edge.job_id
202
+
203
+ return process
204
+
205
+ def export(self) -> str:
206
+ """Export to Apache Atlas JSON format."""
207
+ return self.to_json(indent=2)
208
+
209
+ def _to_dict(self) -> Dict[str, Any]:
210
+ """Convert to Atlas bulk import dictionary."""
211
+ entities = []
212
+ referred_entities = {}
213
+
214
+ # Create database entities
215
+ databases = set()
216
+ for node in self.graph.nodes:
217
+ if node.database and node.database not in databases:
218
+ databases.add(node.database)
219
+ db_entity = {
220
+ "typeName": "hive_db",
221
+ "guid": self._generate_guid(f"db.{node.database}"),
222
+ "attributes": {
223
+ "qualifiedName": f"{node.database}@{self.cluster_name}",
224
+ "name": node.database,
225
+ "description": f"Database: {node.database}",
226
+ "clusterName": self.cluster_name
227
+ },
228
+ "status": "ACTIVE"
229
+ }
230
+ entities.append(db_entity)
231
+
232
+ # Create table/dataset entities
233
+ for node in self.graph.nodes:
234
+ entity = self._create_entity(node)
235
+ entities.append(entity)
236
+
237
+ # Create column entities
238
+ for col_entity in self._create_column_entities(node):
239
+ entities.append(col_entity)
240
+
241
+ # Create process entities for lineage
242
+ for edge in self.graph.edges:
243
+ process = self._create_process_entity(edge)
244
+ entities.append(process)
245
+
246
+ return {
247
+ "exportInfo": {
248
+ "producer": "Lineage Graph Accelerator",
249
+ "exportedAt": self.graph.generated_at,
250
+ "sourceLineageName": self.graph.name,
251
+ "format": "Apache Atlas",
252
+ "version": "2.0"
253
+ },
254
+ "atlasVersion": "2.3.0",
255
+ "cluster": self.cluster_name,
256
+ "entities": entities,
257
+ "referredEntities": referred_entities,
258
+ "summary": {
259
+ "totalEntities": len(entities),
260
+ "databases": list(databases),
261
+ "entityTypes": list(set(e["typeName"] for e in entities)),
262
+ "processCount": len(self.graph.edges)
263
+ }
264
+ }
tests/test_app.py CHANGED
@@ -96,6 +96,21 @@ class TestExporters(unittest.TestCase):
96
  self.assertIn("Alation", output)
97
  self.assertIn("Node A", output)
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  class TestSampleDataLoading(unittest.TestCase):
101
  def test_load_sample_simple(self):
 
96
  self.assertIn("Alation", output)
97
  self.assertIn("Node A", output)
98
 
99
+ def test_atlas_export(self):
100
+ from exporters import LineageGraph, LineageNode, LineageEdge, AtlasExporter
101
+
102
+ graph = LineageGraph(name="test")
103
+ graph.add_node(LineageNode(id="a", name="Node A", type="table"))
104
+ graph.add_node(LineageNode(id="b", name="Node B", type="table"))
105
+ graph.add_edge(LineageEdge(source="a", target="b", type="transform"))
106
+
107
+ exporter = AtlasExporter(graph)
108
+ output = exporter.export()
109
+
110
+ self.assertIn("Atlas", output)
111
+ self.assertIn("Node A", output)
112
+ self.assertIn("entities", output)
113
+
114
 
115
  class TestSampleDataLoading(unittest.TestCase):
116
  def test_load_sample_simple(self):