Skip to content

Commit 10d97d9

Browse files
committed
Add duplicate file analysis for Android
1 parent 67a752d commit 10d97d9

File tree

5 files changed

+156
-10
lines changed

5 files changed

+156
-10
lines changed

src/launchpad/analyzers/android.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from datetime import datetime, timezone
55

66
from ..artifacts import AAB, APK, AndroidArtifact, ZippedAAB, ZippedAPK
7-
from ..models.android import AndroidAnalysisResults, AndroidAppInfo
7+
from ..insights.common import DuplicateFilesInsight, InsightsInput
8+
from ..models.android import AndroidAnalysisResults, AndroidAppInfo, AndroidInsightResults
89
from ..models.common import FileAnalysis, FileInfo
910
from ..models.treemap import FILE_TYPE_TO_TREEMAP_TYPE, TreemapType
1011
from ..utils.file_utils import calculate_file_hash
@@ -21,6 +22,17 @@
2122
class AndroidAnalyzer:
2223
"""Analyzer for Android apps (.apk, .aab files)."""
2324

25+
def __init__(self, skip_insights: bool = False) -> None:
    """Create an Android analyzer.

    Args:
        skip_insights: When True, insights generation is skipped for
            faster analysis.
    """
    # Consulted by analyze() to decide whether insights are generated.
    self.skip_insights = skip_insights
35+
2436
def analyze(self, artifact: AndroidArtifact) -> AndroidAnalysisResults:
2537
manifest_dict = artifact.get_manifest().model_dump()
2638
start_time = time.time()
@@ -57,13 +69,26 @@ def analyze(self, artifact: AndroidArtifact) -> AndroidAnalysisResults:
5769

5870
treemap = treemap_builder.build_file_treemap(file_analysis)
5971

72+
insights: AndroidInsightResults | None = None
73+
if not self.skip_insights:
74+
logger.info("Generating insights from analysis results")
75+
insights_input = InsightsInput(
76+
app_info=app_info,
77+
file_analysis=file_analysis,
78+
treemap=treemap,
79+
)
80+
insights = AndroidInsightResults(
81+
duplicate_files=DuplicateFilesInsight().__call__(insights_input),
82+
)
83+
6084
analysis_duration = time.time() - start_time
6185
return AndroidAnalysisResults(
6286
generated_at=datetime.now(timezone.utc),
6387
analysis_duration=analysis_duration,
6488
app_info=app_info,
6589
treemap=treemap,
6690
file_analysis=file_analysis,
91+
insights=insights,
6792
)
6893

6994
def _get_file_analysis(self, apks: list[APK]) -> FileAnalysis:
@@ -104,14 +129,17 @@ def _get_file_analysis(self, apks: list[APK]) -> FileAnalysis:
104129
merged_size,
105130
)
106131

132+
# For merged files, we need to recalculate the hash since the content
133+
# might be different across APKs. We'll use the hash of the current file.
134+
merged_file_hash = calculate_file_hash(file_path, algorithm="md5")
135+
107136
# Create new FileInfo with merged size
108137
merged_file_info = FileInfo(
109138
path=relative_path,
110139
size=merged_size,
111140
file_type=file_type,
112141
treemap_type=treemap_type,
113-
# Intentionally ignoring hash of merged file
114-
hash_md5="",
142+
hash_md5=merged_file_hash,
115143
)
116144
path_to_file_info[relative_path] = merged_file_info
117145
else:

src/launchpad/cli.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,10 @@ def android(
216216
else:
217217
_print_android_table_output(results, quiet)
218218

219+
if not quiet:
220+
console.print(f"\n[bold green]✓[/bold green] Analysis completed in {duration:.2f}s")
221+
_print_android_summary(results)
222+
219223
except Exception as e:
220224
if verbose:
221225
console.print_exception()
@@ -376,6 +380,24 @@ def _print_file_analysis_table(file_analysis: FileAnalysis) -> None:
376380
console.print()
377381

378382

383+
def _print_android_summary(results: AndroidAnalysisResults) -> None:
    """Print a short summary of an Android analysis to the console.

    The app name, package name, total size and file count are always
    printed. A duplicate-savings line is added only when insights are
    present and report a positive ``total_savings``.

    Args:
        results: Completed Android analysis results to summarize.
    """
    app = results.app_info
    files = results.file_analysis

    console.print("\n[bold]Summary:[/bold]")
    console.print(f"• App name: [cyan]{app.name}[/cyan]")
    console.print(f"• Package name: [cyan]{app.package_name}[/cyan]")
    console.print(f"• Total app size: [cyan]{_format_bytes(files.total_size)}[/cyan]")
    console.print(f"• File count: [cyan]{files.file_count:,}[/cyan]")

    # Both the insights container and the duplicate-files result are optional.
    duplicates = results.insights.duplicate_files if results.insights else None
    if duplicates and duplicates.total_savings > 0:
        savings = _format_bytes(duplicates.total_savings)
        console.print(f"• Potential savings from duplicates: [yellow]{savings}[/yellow]")
399+
400+
379401
def _print_apple_summary(results: AppleAnalysisResults) -> None:
380402
"""Print a brief summary of the analysis."""
381403
file_analysis = results.file_analysis

src/launchpad/insights/common.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,21 @@
44

55
from collections import defaultdict
66
from dataclasses import dataclass
7-
from typing import Dict, List, Protocol, TypeVar
7+
from typing import Dict, List, Protocol, Sequence, TypeVar
88

9-
from launchpad.models.common import FileAnalysis, FileInfo
9+
from launchpad.models.common import BaseAppInfo, BaseBinaryAnalysis, FileAnalysis, FileInfo
1010
from launchpad.models.insights import DuplicateFilesInsightResult
1111
from launchpad.models.treemap import TreemapResults
1212

13-
from ..models.apple import AppleAppInfo, MachOBinaryAnalysis
14-
1513
T_co = TypeVar("T_co", covariant=True)
1614

1715

1816
@dataclass
class InsightsInput:
    """Bundle of analysis outputs handed to an insight.

    Fields are typed with the shared base models rather than Apple-specific
    ones, and ``binary_analysis`` is optional.
    """

    # App metadata.
    app_info: BaseAppInfo
    # Per-file analysis of the artifact.
    file_analysis: FileAnalysis
    # Treemap results; may be None.
    treemap: TreemapResults | None
    # Binary analyses; defaults to None when not supplied.
    binary_analysis: Sequence[BaseBinaryAnalysis] | None = None
2422

2523

2624
class Insight(Protocol[T_co]):

src/launchpad/models/android.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
from pydantic import ConfigDict, Field
1+
from pydantic import BaseModel, ConfigDict, Field
22

33
from .common import BaseAnalysisResults, BaseAppInfo
4+
from .insights import DuplicateFilesInsightResult
45

56

67
class AndroidAppInfo(BaseAppInfo):
@@ -9,8 +10,19 @@ class AndroidAppInfo(BaseAppInfo):
910
package_name: str = Field(..., description="Android package name")
1011

1112

13+
class AndroidInsightResults(BaseModel):
    """Container for the insight results produced for an Android app."""

    model_config = ConfigDict(frozen=True)

    # Defaults to None when no duplicate-files result is provided.
    duplicate_files: DuplicateFilesInsightResult | None = Field(
        default=None,
        description="Duplicate files analysis",
    )
19+
20+
1221
class AndroidAnalysisResults(BaseAnalysisResults):
    """Complete Android analysis results."""

    model_config = ConfigDict(frozen=True)

    app_info: AndroidAppInfo = Field(..., description="Android app information")
    # An explicit default keeps the field truly optional: without it the
    # field is required even though its type allows None, so results built
    # or loaded without an "insights" entry would fail validation.
    insights: AndroidInsightResults | None = Field(
        default=None,
        description="Generated insights from the analysis",
    )

tests/unit/test_android_analyzer.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""Tests for Android analyzer with duplicate file detection."""
2+
3+
from pathlib import Path
4+
5+
import pytest
6+
7+
from launchpad.analyzers.android import AndroidAnalyzer
8+
from launchpad.artifacts import ArtifactFactory
9+
10+
11+
@pytest.fixture
def test_apk_path() -> Path:
    """Return the path to the sample ``hn.apk`` fixture.

    The path is anchored to this test module's location (``tests/unit``)
    instead of the current working directory, so the fixture resolves no
    matter where pytest is invoked from.
    """
    tests_root = Path(__file__).resolve().parents[1]
    return tests_root / "_fixtures" / "android" / "hn.apk"
14+
15+
16+
@pytest.fixture
def android_analyzer() -> AndroidAnalyzer:
    """Provide an analyzer with insights generation enabled (the default)."""
    return AndroidAnalyzer(skip_insights=False)
19+
20+
21+
class TestAndroidAnalyzer:
    """End-to-end checks of AndroidAnalyzer against the sample APK fixture."""

    def test_analyze_with_duplicate_detection(self, test_apk_path: Path, android_analyzer: AndroidAnalyzer) -> None:
        """Test that Android analyzer includes duplicate file detection."""
        artifact = ArtifactFactory.from_path(test_apk_path)
        results = android_analyzer.analyze(artifact)

        # Verify basic analysis results
        assert results.app_info.name == "Hacker News"
        assert results.app_info.package_name == "com.emergetools.hackernews"
        assert results.file_analysis is not None
        assert len(results.file_analysis.files) > 0

        # Verify insights are generated
        assert results.insights is not None
        assert results.insights.duplicate_files is not None

        # Verify duplicate files insight structure; the attribute accesses
        # below already fail loudly if total_savings/duplicate_count are absent.
        duplicate_insight = results.insights.duplicate_files
        assert hasattr(duplicate_insight, "files")
        assert isinstance(duplicate_insight.total_savings, int)
        assert isinstance(duplicate_insight.duplicate_count, int)
        assert duplicate_insight.total_savings == 51709
        assert duplicate_insight.duplicate_count == 52

    def test_duplicate_files_have_hashes(self, test_apk_path: Path, android_analyzer: AndroidAnalyzer) -> None:
        """Test that all files have MD5 hashes for duplicate detection."""
        artifact = ArtifactFactory.from_path(test_apk_path)
        results = android_analyzer.analyze(artifact)

        for file_info in results.file_analysis.files:
            # Truthiness rejects both None and the empty string.
            assert file_info.hash_md5, f"missing MD5 hash for {file_info.path}"

    def test_duplicate_detection_algorithm(self, test_apk_path: Path, android_analyzer: AndroidAnalyzer) -> None:
        """Test that duplicate detection groups files by hash correctly."""
        artifact = ArtifactFactory.from_path(test_apk_path)
        results = android_analyzer.analyze(artifact)

        # Insights are optional on the results model; fail with a clear
        # assertion instead of an AttributeError if they are missing.
        assert results.insights is not None
        duplicate_insight = results.insights.duplicate_files
        assert duplicate_insight is not None

        # Group files by hash to verify the algorithm
        files_by_hash: dict[str, list] = {}
        for file_info in results.file_analysis.files:
            files_by_hash.setdefault(file_info.hash_md5, []).append(file_info)

        expected_duplicate_files = []
        expected_total_savings = 0
        for file_list in files_by_hash.values():
            if len(file_list) < 2:
                continue

            # Potential savings: every copy except the first one, which is kept.
            extra_copies = file_list[1:]
            savings = sum(f.size for f in extra_copies)
            if savings > 0:
                expected_duplicate_files.extend(extra_copies)
                expected_total_savings += savings

        # Verify the insight results match our manual calculation
        assert len(duplicate_insight.files) == len(expected_duplicate_files)
        assert duplicate_insight.total_savings == expected_total_savings

0 commit comments

Comments
 (0)