From 6d3f253a46924312d14e30ffc59c9987613bc61d Mon Sep 17 00:00:00 2001 From: Breakthrough Date: Fri, 17 Apr 2026 23:50:27 -0400 Subject: [PATCH] [api] Make sure all values are temporal --- scenedetect.cfg | 3 +- scenedetect/_cli/__init__.py | 10 +- scenedetect/_cli/config.py | 2 +- scenedetect/detector.py | 35 +++++- scenedetect/detectors/adaptive_detector.py | 7 +- scenedetect/detectors/content_detector.py | 7 +- scenedetect/detectors/hash_detector.py | 7 +- scenedetect/detectors/histogram_detector.py | 12 +- scenedetect/detectors/threshold_detector.py | 7 +- scenedetect/detectors/transnet_v2.py | 2 +- scenedetect/output/image.py | 127 ++++++++------------ tests/test_detectors.py | 22 ++++ tests/test_output.py | 32 +++++ website/pages/changelog.md | 2 + 14 files changed, 170 insertions(+), 105 deletions(-) diff --git a/scenedetect.cfg b/scenedetect.cfg index fd6241cd..f0901cc0 100644 --- a/scenedetect.cfg +++ b/scenedetect.cfg @@ -227,7 +227,8 @@ # Compression amount for png images (0 to 9). Only affects size, not quality. #compression = 3 -# Number of frames to ignore around each scene cut when selecting frames. +# Padding around each scene cut when selecting frames. Accepts a number of frames (1), +# seconds with `s` suffix (0.1s), or timecode (00:00:00.100). #frame-margin = 1 # Resize by scale factor (0.5 = half, 1.0 = same, 2.0 = double). diff --git a/scenedetect/_cli/__init__.py b/scenedetect/_cli/__init__.py index a0c639d2..88dc9654 100644 --- a/scenedetect/_cli/__init__.py +++ b/scenedetect/_cli/__init__.py @@ -1397,11 +1397,11 @@ def split_video_command( @click.option( "-m", "--frame-margin", - metavar="N", + metavar="DURATION", default=None, - type=click.INT, - help="Number of frames to ignore at beginning/end of scenes when saving images. Controls temporal padding on scene boundaries.%s" - % (USER_CONFIG.get_help_string("save-images", "num-images")), + type=click.STRING, + help="Padding around the beginning/end of each scene used when selecting which frames to extract. DURATION can be specified in frames (-m 1), in seconds with `s` suffix (-m 0.1s), or timecode (-m 00:00:00.100).%s" + % (USER_CONFIG.get_help_string("save-images", "frame-margin")), ) @click.option( "--scale", @@ -1441,7 +1441,7 @@ def save_images_command( quality: ty.Optional[int] = None, png: bool = False, compression: ty.Optional[int] = None, - frame_margin: ty.Optional[int] = None, + frame_margin: ty.Optional[str] = None, scale: ty.Optional[float] = None, height: ty.Optional[int] = None, width: ty.Optional[int] = None, diff --git a/scenedetect/_cli/config.py b/scenedetect/_cli/config.py index ee851da8..80fc082f 100644 --- a/scenedetect/_cli/config.py +++ b/scenedetect/_cli/config.py @@ -412,7 +412,7 @@ class XmlFormat(Enum): "compression": RangeValue(3, min_val=0, max_val=9), "filename": "$VIDEO_NAME-Scene-$SCENE_NUMBER-$IMAGE_NUMBER", "format": "jpeg", - "frame-margin": 1, + "frame-margin": TimecodeValue(1), "height": 0, "num-images": 3, "output": None, diff --git a/scenedetect/detector.py b/scenedetect/detector.py index 7c3e1b70..8afd0f71 100644 --- a/scenedetect/detector.py +++ b/scenedetect/detector.py @@ -24,6 +24,7 @@ event (in, out, cut, etc...). """ +import math import typing as ty from abc import ABC, abstractmethod from enum import Enum @@ -114,15 +115,26 @@ class Mode(Enum): SUPPRESS = 1 """Suppress consecutive cuts until the filter length has passed.""" - def __init__(self, mode: Mode, length: int): + def __init__(self, mode: Mode, length: ty.Union[int, float, str]): """ Arguments: mode: The mode to use when enforcing `length`. - length: Number of frames to use when filtering cuts. + length: Minimum scene length. Accepts an `int` (number of frames), `float` (seconds), + or `str` (timecode, e.g. ``"0.6s"`` or ``"00:00:00.600"``). """ self._mode = mode - self._filter_length = length # Number of frames to use for activating the filter. - self._filter_secs: ty.Optional[float] = None # Threshold in seconds, computed on first use. + # Frame count (int) and seconds (float) representations of `length`. Exactly one is + # populated up front; the other is computed on the first frame once the framerate is + # known. Temporal inputs (float/non-digit str) populate `_filter_secs`; integer inputs + # (int/digit str) populate `_filter_length`. + self._filter_length: int = 0 + self._filter_secs: ty.Optional[float] = None + if isinstance(length, float): + self._filter_secs = length + elif isinstance(length, str) and not length.strip().isdigit(): + self._filter_secs = FrameTimecode(timecode=length, fps=100.0).seconds + else: + self._filter_length = int(length) self._last_above = None # Last frame above threshold. self._merge_enabled = False # Used to disable merging until at least one cut was found. self._merge_triggered = False # True when the merge filter is active. @@ -130,10 +142,21 @@ def __init__(self, mode: Mode, length: int): @property def max_behind(self) -> int: - return 0 if self._mode == FlashFilter.Mode.SUPPRESS else self._filter_length + if self._mode == FlashFilter.Mode.SUPPRESS: + return 0 + if self._filter_secs is not None: + # Estimate using 240fps so the event buffer is large enough for any reasonable input. + return math.ceil(self._filter_secs * 240.0) + return self._filter_length + + @property + def _is_disabled(self) -> bool: + if self._filter_secs is not None: + return self._filter_secs <= 0.0 + return self._filter_length <= 0 def filter(self, timecode: FrameTimecode, above_threshold: bool) -> ty.List[FrameTimecode]: - if not self._filter_length > 0: + if self._is_disabled: return [timecode] if above_threshold else [] if self._last_above is None: self._last_above = timecode diff --git a/scenedetect/detectors/adaptive_detector.py b/scenedetect/detectors/adaptive_detector.py index 7a0a23af..f1917d77 100644 --- a/scenedetect/detectors/adaptive_detector.py +++ b/scenedetect/detectors/adaptive_detector.py @@ -38,7 +38,7 @@ class AdaptiveDetector(ContentDetector): def __init__( self, adaptive_threshold: float = 3.0, - min_scene_len: int = 15, + min_scene_len: ty.Union[int, float, str] = 15, window_width: int = 2, min_content_val: float = 15.0, weights: ContentDetector.Components = ContentDetector.DEFAULT_COMPONENT_WEIGHTS, @@ -49,8 +49,9 @@ def __init__( Arguments: adaptive_threshold: Threshold (float) that score ratio must exceed to trigger a new scene (see frame metric adaptive_ratio in stats file). - min_scene_len: Once a cut is detected, this many frames must pass before a new one can - be added to the scene list. Can be an int or FrameTimecode type. + min_scene_len: Once a cut is detected, this much time must pass before a new one can + be added to the scene list. Accepts an int (frames), float (seconds), or + str (e.g. ``"0.6s"``, ``"00:00:00.600"``). window_width: Size of window (number of frames) before and after each frame to average together in order to detect deviations from the mean. Must be at least 1. min_content_val: Minimum threshold (float) that the content_val must exceed in order to diff --git a/scenedetect/detectors/content_detector.py b/scenedetect/detectors/content_detector.py index 6cf757fa..268233c3 100644 --- a/scenedetect/detectors/content_detector.py +++ b/scenedetect/detectors/content_detector.py @@ -104,7 +104,7 @@ class _FrameData: def __init__( self, threshold: float = 27.0, - min_scene_len: int = 15, + min_scene_len: ty.Union[int, float, str] = 15, weights: "ContentDetector.Components" = DEFAULT_COMPONENT_WEIGHTS, luma_only: bool = False, kernel_size: ty.Optional[int] = None, @@ -113,8 +113,9 @@ def __init__( """ Arguments: threshold: Threshold the average change in pixel intensity must exceed to trigger a cut. - min_scene_len: Once a cut is detected, this many frames must pass before a new one can - be added to the scene list. Can be an int or FrameTimecode type. + min_scene_len: Once a cut is detected, this much time must pass before a new one can + be added to the scene list. Accepts an int (frames), float (seconds), or + str (e.g. ``"0.6s"``, ``"00:00:00.600"``). weights: Weight to place on each component when calculating frame score (`content_val` in a statsfile, the value `threshold` is compared against). luma_only: If True, only considers changes in the luminance channel of the video. diff --git a/scenedetect/detectors/hash_detector.py b/scenedetect/detectors/hash_detector.py index 484f49d5..af0994d1 100644 --- a/scenedetect/detectors/hash_detector.py +++ b/scenedetect/detectors/hash_detector.py @@ -41,8 +41,9 @@ class HashDetector(SceneDetector): size: Size of square of low frequency data to use for the DCT lowpass: How much high frequency information to filter from the DCT. A value of 2 means keep lower 1/2 of the frequency data, 4 means only keep 1/4, etc... - min_scene_len: Once a cut is detected, this many frames must pass before a new one can - be added to the scene list. Can be an int or FrameTimecode type. + min_scene_len: Once a cut is detected, this much time must pass before a new one can + be added to the scene list. Accepts an int (frames), float (seconds), or + str (e.g. ``"0.6s"``, ``"00:00:00.600"``). """ def __init__( @@ -50,7 +51,7 @@ def __init__( threshold: float = 0.395, size: int = 16, lowpass: int = 2, - min_scene_len: int = 15, + min_scene_len: ty.Union[int, float, str] = 15, ): super(HashDetector, self).__init__() self._threshold = threshold diff --git a/scenedetect/detectors/histogram_detector.py b/scenedetect/detectors/histogram_detector.py index 812c5852..8502e1e5 100644 --- a/scenedetect/detectors/histogram_detector.py +++ b/scenedetect/detectors/histogram_detector.py @@ -30,7 +30,12 @@ class HistogramDetector(SceneDetector): METRIC_KEYS = ["hist_diff"] - def __init__(self, threshold: float = 0.05, bins: int = 256, min_scene_len: int = 15): + def __init__( + self, + threshold: float = 0.05, + bins: int = 256, + min_scene_len: ty.Union[int, float, str] = 15, + ): """ Arguments: threshold: maximum relative difference between 0.0 and 1.0 that the histograms can @@ -38,8 +43,9 @@ def __init__(self, threshold: float = 0.05, bins: int = 256, min_scene_len: int YUV, and normalized based on the number of bins. Higher dicfferences imply greater change in content, so larger threshold values are less sensitive to cuts. bins: Number of bins to use for the histogram. - min_scene_len: Once a cut is detected, this many frames must pass before a new one can - be added to the scene list. Can be an int or FrameTimecode type. + min_scene_len: Once a cut is detected, this much time must pass before a new one can + be added to the scene list. Accepts an int (frames), float (seconds), or + str (e.g. ``"0.6s"``, ``"00:00:00.600"``). """ super().__init__() # Internally, threshold represents the correlation between two histograms and has values diff --git a/scenedetect/detectors/threshold_detector.py b/scenedetect/detectors/threshold_detector.py index 8d28cd62..edb63024 100644 --- a/scenedetect/detectors/threshold_detector.py +++ b/scenedetect/detectors/threshold_detector.py @@ -48,7 +48,7 @@ class Method(Enum): def __init__( self, threshold: float = 12, - min_scene_len: int = 15, + min_scene_len: ty.Union[int, float, str] = 15, fade_bias: float = 0.0, add_final_scene: bool = False, method: Method = Method.FLOOR, @@ -58,8 +58,9 @@ def __init__( Arguments: threshold: 8-bit intensity value that each pixel value (R, G, and B) must be <= to in order to trigger a fade in/out. - min_scene_len: Once a cut is detected, this many frames must pass before a new one can - be added to the scene list. Can be an int or FrameTimecode type. + min_scene_len: Once a cut is detected, this much time must pass before a new one can + be added to the scene list. Accepts an int (frames), float (seconds), or + str (e.g. ``"0.6s"``, ``"00:00:00.600"``). fade_bias: Float between -1.0 and +1.0 representing the percentage of timecode skew for the start of a scene (-1.0 causing a cut at the fade-to-black, 0.0 in the middle, and +1.0 causing the cut to be diff --git a/scenedetect/detectors/transnet_v2.py b/scenedetect/detectors/transnet_v2.py index 752749cd..c559938e 100644 --- a/scenedetect/detectors/transnet_v2.py +++ b/scenedetect/detectors/transnet_v2.py @@ -135,7 +135,7 @@ def __init__( model_path: ty.Union[str, Path] = "tests/resources/transnetv2.onnx", onnx_providers: ty.Union[ty.List[str], None] = None, threshold: float = 0.5, - min_scene_len: int = 15, + min_scene_len: ty.Union[int, float, str] = 15, filter_mode: FlashFilter.Mode = FlashFilter.Mode.MERGE, ): super().__init__() diff --git a/scenedetect/output/image.py b/scenedetect/output/image.py index d5cf00de..842e1460 100644 --- a/scenedetect/output/image.py +++ b/scenedetect/output/image.py @@ -34,6 +34,41 @@ logger = logging.getLogger("pyscenedetect") +def _generate_timecode_list( + scene_list: SceneList, + num_images: int, + frame_margin: ty.Union[int, float, str], +) -> ty.List[ty.List[FrameTimecode]]: + """Generate per-scene image timecodes using PTS-accurate seconds-based timing. + + `frame_margin` accepts an int (frames), float (seconds), or str (e.g. ``"0.1s"``). + """ + framerate = scene_list[0][0].framerate + margin_secs = FrameTimecode(timecode=frame_margin, fps=framerate).seconds + result = [] + for start, end in scene_list: + duration_secs = (end - start).seconds + if duration_secs <= 0: + result.append([start] * num_images) + continue + segment_secs = duration_secs / num_images + timecodes = [] + for j in range(num_images): + seg_start = start.seconds + j * segment_secs + seg_end = start.seconds + (j + 1) * segment_secs + if num_images == 1: + t = start.seconds + duration_secs / 2.0 + elif j == 0: + t = min(seg_start + margin_secs, seg_end) + elif j == num_images - 1: + t = max(seg_end - margin_secs, seg_start) + else: + t = (seg_start + seg_end) / 2.0 + timecodes.append(FrameTimecode(t, fps=framerate)) + result.append(timecodes) + return result + + def _scale_image( image: np.ndarray, aspect_ratio: float, @@ -69,7 +104,7 @@ class _ImageExtractor: def __init__( self, num_images: int = 3, - frame_margin: int = 1, + frame_margin: ty.Union[int, float, str] = 1, image_extension: str = "jpg", imwrite_param: ty.Dict[str, ty.Union[int, None]] = None, image_name_template: str = "$VIDEO_NAME-Scene-$SCENE_NUMBER-$IMAGE_NUMBER", @@ -85,10 +120,10 @@ def __init__( Arguments: num_images: Number of images to generate for each scene. Minimum is 1. - frame_margin: Number of frames to pad each scene around the beginning - and end (e.g. moves the first/last image into the scene by N frames). - Can set to 0, but will result in some video files failing to extract - the very last frame. + frame_margin: Padding around the beginning and end of each scene used when + selecting which frames to extract. Accepts an int (frames), float (seconds), + or str (e.g. ``"0.1s"``, ``"00:00:00.100"``). Can be 0, but some video files + may then fail to extract the very last frame. image_extension: Type of image to save (must be one of 'jpg', 'png', or 'webp'). encoder_param: Quality/compression efficiency, based on type of image: 'jpg' / 'webp': Quality 0-100, higher is better quality. 100 is lossless for webp. @@ -296,31 +331,7 @@ def generate_timecode_list(self, scene_list: SceneList) -> ty.List[ty.List[Frame Uses PTS-accurate seconds-based timing so results are correct for both CFR and VFR video. """ - framerate = scene_list[0][0].framerate - # Convert frame_margin to seconds using the nominal framerate. - margin_secs = self._frame_margin / framerate - result = [] - for start, end in scene_list: - duration_secs = (end - start).seconds - if duration_secs <= 0: - result.append([start] * self._num_images) - continue - segment_secs = duration_secs / self._num_images - timecodes = [] - for j in range(self._num_images): - seg_start = start.seconds + j * segment_secs - seg_end = start.seconds + (j + 1) * segment_secs - if self._num_images == 1: - t = start.seconds + duration_secs / 2.0 - elif j == 0: - t = min(seg_start + margin_secs, seg_end) - elif j == self._num_images - 1: - t = max(seg_end - margin_secs, seg_start) - else: - t = (seg_start + seg_end) / 2.0 - timecodes.append(FrameTimecode(t, fps=framerate)) - result.append(timecodes) - return result + return _generate_timecode_list(scene_list, self._num_images, self._frame_margin) def resize_image( self, @@ -336,7 +347,7 @@ def save_images( scene_list: SceneList, video: VideoStream, num_images: int = 3, - frame_margin: int = 1, + frame_margin: ty.Union[int, float, str] = 1, image_extension: str = "jpg", encoder_param: int = 95, image_name_template: str = "$VIDEO_NAME-Scene-$SCENE_NUMBER-$IMAGE_NUMBER", @@ -357,10 +368,10 @@ def save_images( video: A VideoStream object corresponding to the scene list. Note that the video will be closed/re-opened and seeked through. num_images: Number of images to generate for each scene. Minimum is 1. - frame_margin: Number of frames to pad each scene around the beginning - and end (e.g. moves the first/last image into the scene by N frames). - Can set to 0, but will result in some video files failing to extract - the very last frame. + frame_margin: Padding around the beginning and end of each scene used when + selecting which frames to extract. Accepts an int (frames), float (seconds), + or str (e.g. ``"0.1s"``, ``"00:00:00.100"``). Can be 0, but some video files + may then fail to extract the very last frame. image_extension: Type of image to save (must be one of 'jpg', 'png', or 'webp'). encoder_param: Quality/compression efficiency, based on type of image: 'jpg' / 'webp': Quality 0-100, higher is better quality. 100 is lossless for webp. @@ -398,8 +409,10 @@ def save_images( if not scene_list: return {} - if num_images <= 0 or frame_margin < 0: - raise ValueError() + if num_images <= 0: + raise ValueError("num_images must be greater than 0") + if isinstance(frame_margin, (int, float)) and frame_margin < 0: + raise ValueError("frame_margin must be non-negative") # TODO: Validate that encoder_param is within the proper range. # Should be between 0 and 100 (inclusive) for jpg/webp, and 1-9 for png. @@ -440,45 +453,7 @@ def save_images( image_num_format = "%0" image_num_format += str(math.floor(math.log(num_images, 10)) + 2) + "d" - framerate = scene_list[0][0]._rate - - # TODO(v1.0): Split up into multiple sub-expressions so auto-formatter works correctly. - timecode_list = [ - [ - FrameTimecode(int(f), fps=framerate) - for f in ( - # middle frames - a[len(a) // 2] - if (0 < j < num_images - 1) or num_images == 1 - # first frame - else min(a[0] + frame_margin, a[-1]) - if j == 0 - # last frame - else max(a[-1] - frame_margin, a[0]) - # for each evenly-split array of frames in the scene list - for j, a in enumerate(np.array_split(r, num_images)) - ) - ] - for i, r in enumerate( - [ - # pad ranges to number of images - r if 1 + r[-1] - r[0] >= num_images else list(r) + [r[-1]] * (num_images - len(r)) - # create range of frames in scene - for r in ( - range( - start.frame_num, - start.frame_num - + max( - 1, # guard against zero length scenes - end.frame_num - start.frame_num, - ), - ) - # for each scene in scene list - for start, end in scene_list - ) - ] - ) - ] + timecode_list = _generate_timecode_list(scene_list, num_images, frame_margin) image_filenames = {i: [] for i in range(len(timecode_list))} aspect_ratio = video.aspect_ratio diff --git a/tests/test_detectors.py b/tests/test_detectors.py index 0e5f4214..8db13213 100644 --- a/tests/test_detectors.py +++ b/tests/test_detectors.py @@ -224,3 +224,25 @@ def test_detectors_with_stats(test_video_file): scene_manager.detect_scenes(video=video, end_time=end_time) scene_list = scene_manager.get_scene_list() assert len(scene_list) == initial_scene_len + + +@pytest.mark.parametrize("detector_type", FAST_CUT_DETECTORS) +@pytest.mark.parametrize( + "min_scene_len", + # 30 frames at goldeneye.mp4's 24000/1001 (~23.976) fps is ~1.2513s. All four forms should + # produce identical cut lists, demonstrating that detectors accept temporal as well as + # frame-count values. + [30, 1.25, "1.25s", "00:00:01.250"], +) +def test_min_scene_len_accepts_time_values(detector_type, min_scene_len): + """Detectors accept min_scene_len as int (frames), float (seconds), or str (timecode).""" + test_case = TestCase( + path=get_absolute_path("resources/goldeneye.mp4"), + detector=detector_type(min_scene_len=min_scene_len), + start_time=1199, + end_time=1450, + scene_boundaries=[1199, 1260, 1334, 1365], + ) + scene_list = test_case.detect() + start_frames = [timecode.frame_num for timecode, _ in scene_list] + assert start_frames == test_case.scene_boundaries diff --git a/tests/test_output.py b/tests/test_output.py index db3f2307..3936f5e8 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -161,6 +161,38 @@ def test_save_images_singlethreaded(test_video_file, tmp_path: Path): assert total_images == len([path for path in tmp_path.glob(image_name_glob)]) +@pytest.mark.parametrize("frame_margin", [1, 0.1, "0.1s", "00:00:00.100"]) +def test_save_images_frame_margin_accepts_time_values( + test_video_file, tmp_path: Path, frame_margin +): + """save_images() should accept frame counts (int), seconds (float), and timecode strings.""" + video = VideoStreamCv2(test_video_file) + video_fps = video.frame_rate + scene_list = [ + (FrameTimecode(start, video_fps), FrameTimecode(end, video_fps)) + for start, end in [(0, 100), (200, 300)] + ] + image_filenames = save_images( + scene_list=scene_list, + output_dir=tmp_path, + video=video, + num_images=3, + image_extension="jpg", + image_name_template="scenedetect.tempfile.$SCENE_NUMBER.$IMAGE_NUMBER", + frame_margin=frame_margin, + ) + for paths in image_filenames.values(): + for path in paths: + assert tmp_path.joinpath(path).exists() + + +def test_save_images_rejects_negative_margin(test_video_file, tmp_path: Path): + video = VideoStreamCv2(test_video_file) + scene_list = [(FrameTimecode(0, video.frame_rate), FrameTimecode(10, video.frame_rate))] + with pytest.raises(ValueError): + save_images(scene_list=scene_list, output_dir=tmp_path, video=video, frame_margin=-1) + + # TODO: Test other functionality against zero width scenes. def test_save_images_zero_width_scene(test_video_file, tmp_path: Path): """Test scenedetect.scene_manager.save_images guards against zero width scenes.""" diff --git a/website/pages/changelog.md b/website/pages/changelog.md index 535e2d5a..3f4fa857 100644 --- a/website/pages/changelog.md +++ b/website/pages/changelog.md @@ -677,6 +677,7 @@ Although there have been minimal changes to most API examples, there are several - [feature] VFR videos are handled correctly by the OpenCV and PyAV backends, and should work correctly with default parameters - [feature] New `save-xml` command supports saving scenes in Final Cut Pro formats [#156](https://github.com/Breakthrough/PySceneDetect/issues/156) +- [feature] `--min-scene-len`/`-m` and `save-images --frame-margin`/`-m` now accept seconds (e.g. `0.6s`) and timecodes (e.g. `00:00:00.600`) in addition to a frame count [#531](https://github.com/Breakthrough/PySceneDetect/issues/531) - [bugfix] Fix floating-point precision error in `save-otio` output where frame values near integer boundaries (e.g. `90.00000000000001`) were serialized with spurious precision - [refactor] Remove deprecated `-d`/`--min-delta-hsv` option from `detect-adaptive` command @@ -702,6 +703,7 @@ Although there have been minimal changes to most API examples, there are several * Remove `SceneDetector.is_processing_required()` method * Remove `SceneDetector.stats_manager_required` property, no longer required * Remove deprecated `SparseSceneDetector` interface + * Detector `min_scene_len` and `save_images()` `frame_margin` arguments now accept seconds (`float`) and timecode strings (e.g. `"0.6s"`, `"00:00:00.600"`) in addition to a frame count (`int`); these are evaluated using the source video's timing for correct behavior on VFR videos [#531](https://github.com/Breakthrough/PySceneDetect/issues/531) **Module Reorganization:**