Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: strict mimetype validation #70

Merged
merged 3 commits into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 5 additions & 19 deletions openhands_aci/editor/editor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import mimetypes
import os
import re
import shutil
import tempfile
from pathlib import Path
from typing import Literal, get_args

from binaryornot.check import is_binary

from openhands_aci.linter import DefaultLinter
from openhands_aci.utils.shell import run_shell_cmd

Expand Down Expand Up @@ -461,26 +462,11 @@ def validate_file(self, path: Path) -> None:
reason=f'File is too large ({file_size / 1024 / 1024:.1f}MB). Maximum allowed size is {int(max_size / 1024 / 1024)}MB.',
)

# Check if file is binary
mime_type, _ = mimetypes.guess_type(str(path))
if mime_type is None:
# If mime_type is None, try to detect if it's binary by reading first chunk
try:
chunk = open(path, 'rb').read(1024)
if b'\0' in chunk: # Common way to detect binary files
raise FileValidationError(
path=str(path),
reason='File appears to be binary. Only text files can be edited.',
)
except Exception as e:
raise FileValidationError(
path=str(path), reason=f'Error checking file type: {str(e)}'
)
elif not mime_type.startswith('text/'):
# Known non-text mime type
# Check file type
if is_binary(str(path)):
raise FileValidationError(
path=str(path),
reason=f'File type {mime_type} is not supported. Only text files can be edited.',
reason='File appears to be binary. Only text files can be edited.',
)

def read_file(
Expand Down
31 changes: 29 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "openhands-aci"
version = "0.2.1"
version = "0.2.2"
description = "An Agent-Computer Interface (ACI) designed for software development agents OpenHands."
authors = ["OpenHands"]
license = "MIT"
Expand All @@ -22,6 +22,7 @@ grep-ast = "0.3.3"
diskcache = "^5.6.3"
flake8 = "*"
whatthepatch = "^1.0.6"
binaryornot = "^0.4.4"


[tool.poetry.group.dev.dependencies]
Expand Down
33 changes: 29 additions & 4 deletions tests/integration/editor/test_file_validation.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,59 @@
"""Tests for file validation in file editor."""

import os
from pathlib import Path

from openhands_aci.editor import file_editor

from .conftest import parse_result


def test_file_validation(temp_file):
"""Test file validation for various file types."""
# Ensure temp_file has .sql suffix
temp_file_sql = Path(temp_file).with_suffix('.sql')
os.rename(temp_file, temp_file_sql)

# Test binary file
with open(temp_file, 'wb') as f:
with open(temp_file_sql, 'wb') as f:
f.write(b'Some text\x00with binary\x00content')

result = file_editor(
command='view',
path=temp_file,
path=str(temp_file_sql),
enable_linting=False,
)
result_json = parse_result(result)
assert 'binary' in result_json['formatted_output_and_error'].lower()

# Test large file
large_size = 11 * 1024 * 1024 # 11MB
with open(temp_file, 'w') as f:
with open(temp_file_sql, 'w') as f:
f.write('x' * large_size)

result = file_editor(
command='view',
path=temp_file,
path=str(temp_file_sql),
enable_linting=False,
)
result_json = parse_result(result)
assert 'too large' in result_json['formatted_output_and_error']
assert '10MB' in result_json['formatted_output_and_error']

# Test SQL file
sql_content = """
SELECT *
FROM users
WHERE id = 1;
"""
with open(temp_file_sql, 'w') as f:
f.write(sql_content)

result = file_editor(
command='view',
path=str(temp_file_sql),
enable_linting=False,
)
result_json = parse_result(result)
assert 'SELECT *' in result_json['formatted_output_and_error']
assert 'binary' not in result_json['formatted_output_and_error'].lower()
32 changes: 20 additions & 12 deletions tests/integration/editor/test_peak_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def get_memory_info():
"""Get current and peak memory usage in bytes."""
process = psutil.Process(os.getpid())
rss = process.memory_info().rss
peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024 # Convert KB to bytes
peak_rss = (
resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024
) # Convert KB to bytes
return {
'rss': rss,
'peak_rss': peak_rss,
Expand All @@ -28,17 +30,17 @@ def create_test_file(path: Path, size_mb: float = 5.0):
line_size = 100 # bytes per line approximately
num_lines = int((size_mb * 1024 * 1024) // line_size)

print(f"\nCreating test file with {num_lines} lines...")
print(f'\nCreating test file with {num_lines} lines...')
with open(path, 'w') as f:
for i in range(num_lines):
f.write(f'Line {i}: ' + 'x' * (line_size - 10) + '\n')

actual_size = os.path.getsize(path)
print(f"File created, size: {actual_size / 1024 / 1024:.2f} MB")
print(f'File created, size: {actual_size / 1024 / 1024:.2f} MB')
return actual_size


def set_memory_limit(file_size: int, multiplier: float = 1.5):
def set_memory_limit(file_size: int, multiplier: float = 2.0):
"""Set memory limit to multiplier * file_size."""
# Add base memory for pytest and other processes (100MB)
base_memory = 100 * 1024 * 1024 # 100MB
Expand All @@ -50,11 +52,13 @@ def set_memory_limit(file_size: int, multiplier: float = 1.5):
current_usage = psutil.Process().memory_info().rss
if memory_limit > current_usage:
resource.setrlimit(resource.RLIMIT_AS, (memory_limit, hard))
print(f"Memory limit set to {memory_limit / 1024 / 1024:.2f} MB")
print(f'Memory limit set to {memory_limit / 1024 / 1024:.2f} MB')
else:
print(f"Warning: Current memory usage ({current_usage / 1024 / 1024:.2f} MB) higher than limit ({memory_limit / 1024 / 1024:.2f} MB)")
print(
f'Warning: Current memory usage ({current_usage / 1024 / 1024:.2f} MB) higher than limit ({memory_limit / 1024 / 1024:.2f} MB)'
)
except Exception as e:
print(f"Warning: Could not set memory limit: {str(e)}")
print(f'Warning: Could not set memory limit: {str(e)}')
return memory_limit


Expand Down Expand Up @@ -82,6 +86,7 @@ def test_str_replace_peak_memory():

# Force Python to release file handles and clear buffers
import gc

gc.collect()

# Get initial memory usage
Expand All @@ -93,7 +98,7 @@ def test_str_replace_peak_memory():

# Perform str_replace operation
try:
result = file_editor(
_ = file_editor(
command='str_replace',
path=path,
old_str='Line 5000', # Replace a line in the middle
Expand All @@ -118,6 +123,7 @@ def test_insert_peak_memory():

# Force Python to release file handles and clear buffers
import gc

gc.collect()

# Get initial memory usage
Expand All @@ -129,7 +135,7 @@ def test_insert_peak_memory():

# Perform insert operation
try:
result = file_editor(
_ = file_editor(
command='insert',
path=path,
insert_line=5000, # Insert in the middle
Expand All @@ -154,6 +160,7 @@ def test_view_peak_memory():

# Force Python to release file handles and clear buffers
import gc

gc.collect()

# Get initial memory usage
Expand All @@ -165,7 +172,7 @@ def test_view_peak_memory():

# Test viewing specific lines
try:
result = file_editor(
_ = file_editor(
command='view',
path=path,
view_range=[5000, 5100], # View 100 lines from middle
Expand All @@ -189,6 +196,7 @@ def test_view_full_file_peak_memory():

# Force Python to release file handles and clear buffers
import gc

gc.collect()

# Get initial memory usage
Expand All @@ -200,7 +208,7 @@ def test_view_full_file_peak_memory():

# Test viewing entire file
try:
result = file_editor(
_ = file_editor(
command='view',
path=path,
enable_linting=False,
Expand All @@ -212,4 +220,4 @@ def test_view_full_file_peak_memory():
pytest.fail('Memory limit exceeded - peak memory usage too high')
raise

check_memory_usage(initial['max'], file_size, 'view_full')
check_memory_usage(initial['max'], file_size, 'view_full')