diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index 0311e76d89..42d996f756 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -343,7 +343,6 @@ def _(_: PrimitiveType, b: bytes) -> int: return _INT_STRUCT.unpack(b)[0] -@from_bytes.register(LongType) @from_bytes.register(TimeType) @from_bytes.register(TimestampType) @from_bytes.register(TimestamptzType) @@ -353,6 +352,14 @@ def _(_: PrimitiveType, b: bytes) -> int: return _LONG_STRUCT.unpack(b)[0] +@from_bytes.register(LongType) +def _(_: PrimitiveType, b: bytes) -> int: + if len(b) < 8: + # If the length is 4 bytes, it is a promoted IntegerType + return _INT_STRUCT.unpack(b)[0] + return _LONG_STRUCT.unpack(b)[0] + + @from_bytes.register(FloatType) def _(_: FloatType, b: bytes) -> float: return _FLOAT_STRUCT.unpack(b)[0] @@ -360,6 +367,9 @@ def _(_: FloatType, b: bytes) -> float: @from_bytes.register(DoubleType) def _(_: DoubleType, b: bytes) -> float: + if len(b) < 8: + # If the length is 4 bytes, it is a promoted FloatType + return _FLOAT_STRUCT.unpack(b)[0] return _DOUBLE_STRUCT.unpack(b)[0] diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py index 03a33e4c78..b8e4d87044 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -41,7 +41,14 @@ Or, StartsWith, ) -from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, _StrictMetricsEvaluator +from pyiceberg.expressions.visitors import ( + ROWS_CANNOT_MATCH, + ROWS_MIGHT_MATCH, + ROWS_MIGHT_NOT_MATCH, + ROWS_MUST_MATCH, + _InclusiveMetricsEvaluator, + _StrictMetricsEvaluator, +) from pyiceberg.manifest import DataFile, FileFormat from pyiceberg.schema import Schema from pyiceberg.typedef import Record @@ -50,6 +57,7 @@ FloatType, IcebergType, IntegerType, + LongType, NestedField, PrimitiveType, StringType, @@ -1463,3 +1471,81 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("no_nulls", {"abc", "def"})).eval(strict_data_file_1) assert not should_read, "Should not match: no_nulls field does not have bounds" + + +@pytest.mark.parametrize( + "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", + [ + # Int -> Long + (IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_CANNOT_MATCH), + (IntegerType(), LongType(), 30, 79, LessThan, 50, ROWS_MIGHT_MATCH), + # Float -> Double + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_CANNOT_MATCH), + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 50.0, ROWS_MIGHT_MATCH), + ], +) +def test_inclusive_metrics_eval_bounds_after_promotion( + file_type: PrimitiveType, + evolved_type: PrimitiveType, + lower_bound: Any, + upper_bound: Any, + op: Any, + lit: Any, + expected: bool, +) -> None: + schema = Schema(NestedField(1, "col", evolved_type, required=True)) + + data_file = DataFile.from_args( + file_path="file_1.parquet", + file_format=FileFormat.PARQUET, + partition={}, + record_count=100, + file_size_in_bytes=1024, + lower_bounds={1: to_bytes(file_type, lower_bound)}, + upper_bounds={1: to_bytes(file_type, upper_bound)}, + ) + + evaluator = _InclusiveMetricsEvaluator(schema, op("col", lit)) + assert evaluator.eval(data_file) == expected + + +@pytest.mark.parametrize( + "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", + [ + # Int -> Long + (IntegerType(), LongType(), 30, 79, GreaterThan, 20, ROWS_MUST_MATCH), + (IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_MIGHT_NOT_MATCH), + (IntegerType(), LongType(), 30, 79, LessThan, 100, ROWS_MUST_MATCH), + (IntegerType(), LongType(), 30, 79, LessThan, 20, ROWS_MIGHT_NOT_MATCH), + # Float -> Double + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 20.0, ROWS_MUST_MATCH), + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_MIGHT_NOT_MATCH), + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 100.0, ROWS_MUST_MATCH), + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 20.0, ROWS_MIGHT_NOT_MATCH), + ], +) +def test_strict_metrics_eval_bounds_after_promotion( + file_type: PrimitiveType, + evolved_type: PrimitiveType, + lower_bound: Any, + upper_bound: Any, + op: Any, + lit: Any, + expected: bool, +) -> None: + schema = Schema(NestedField(1, "col", evolved_type, required=True)) + + data_file = DataFile.from_args( + file_path="file_1.parquet", + file_format=FileFormat.PARQUET, + partition={}, + record_count=100, + file_size_in_bytes=1024, + lower_bounds={1: to_bytes(file_type, lower_bound)}, + upper_bounds={1: to_bytes(file_type, upper_bound)}, + null_value_counts={1: 0}, + nan_value_counts={1: 0}, + ) + + evaluator = _StrictMetricsEvaluator(schema, op("col", lit)) + assert evaluator.eval(data_file) == expected