diff --git a/Lib/pickle.py b/Lib/pickle.py index 3e7cf25cb05337..a28e977317a268 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -162,6 +162,10 @@ def __init__(self, value): _tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3] +# Precomputed BININT1 opcode + payload for n in 0..255. Avoids the +# struct.pack("= 4 (the common case for any + # recent user), MEMOIZE is a one-byte constant; avoid the method + # dispatch + the redundant self.write indirection. + proto = self.proto + if proto >= 4: + self.write(MEMOIZE) + elif self.bin: + if idx < 256: + self.write(BINPUT + pack("= 4: return MEMOIZE @@ -560,7 +579,15 @@ def get(self, i): return GET + repr(i).encode("ascii") + b'\n' def save(self, obj, save_persistent_id=True): - self.framer.commit_frame() + # Inlined commit_frame() hot check. The frame is either None + # (proto < 4) or a BytesIO that only needs committing once it + # exceeds _FRAME_SIZE_TARGET. Skip the Python-level method + # dispatch for the no-op case (the overwhelming majority of + # saves on small/medium payloads). + framer = self.framer + cf = framer.current_frame + if cf is not None and cf.tell() >= _Framer._FRAME_SIZE_TARGET: + framer.commit_frame() # Check for persistent id (defined by a subclass) if save_persistent_id: @@ -569,7 +596,44 @@ def save(self, obj, save_persistent_id=True): self.save_pers(pid) return - # Check the memo + # Fast paths matching the order of Modules/_pickle.c::save(). + # Each of these returns without going through reducer_override, + # which the C reference implementation also skips for these + # types. + t = type(obj) + # str: memoized, so check memo inline before falling into save_str. + if t is str: + x = self.memo.get(id(obj)) + if x is not None: + self.write(self.get(x[0])) + return + self.save_str(obj) + return + # int / None / bool / float: not memoized; skip memo.get entirely. + # Placed before bytes so int-heavy workloads don't pay an extra + # branch miss before hitting their fast path. + if t is int: + self.save_long(obj) + return + if obj is None: + self.write(NONE) + return + if t is bool: + self.save_bool(obj) + return + if t is float: + self.save_float(obj) + return + # bytes: memoized; same inline memo pattern as str. + if t is bytes: + x = self.memo.get(id(obj)) + if x is not None: + self.write(self.get(x[0])) + return + self.save_bytes(obj) + return + + # Check the memo (non-atomic, non-str types) x = self.memo.get(id(obj)) if x is not None: self.write(self.get(x[0])) @@ -582,7 +646,6 @@ def save(self, obj, save_persistent_id=True): if rv is NotImplemented: # Check the type dispatch table - t = type(obj) f = self.dispatch.get(t) if f is not None: f(self, obj) # Call unbound method with explicit self @@ -827,7 +890,7 @@ def save_long(self, obj): # First one- and two-byte unsigned ints: if obj >= 0: if obj <= 0xff: - self.write(BININT1 + pack("= n: + return + remaining = n - idx + if remaining == 1: + try: + save(obj[idx]) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {idx}') + raise + write(APPEND) + return + batch = remaining if remaining < batch_size else batch_size + snapshot = obj[idx:idx + batch] + write(MARK) + i = idx + for x in snapshot: + try: + save(x) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {i}') + raise + i += 1 + write(APPENDS) + idx = i + def _batch_appends(self, items, obj): # Helper to batch up APPENDS sequences save = self.save @@ -1077,10 +1183,56 @@ def save_dict(self, obj): self.write(MARK + DICT) self.memoize(obj) - self._batch_setitems(obj.items(), obj) + if self.bin and type(obj) is dict: + self._batch_setitems_exact(obj) + else: + self._batch_setitems(obj.items(), obj) dispatch[dict] = save_dict + def _batch_setitems_exact(self, obj): + # Fast path for type(obj) is dict, binary protocols. dict's own + # iterator raises RuntimeError on size change, so no snapshotting + # is needed. + save = self.save + write = self.write + batch_size = self._BATCHSIZE + items = obj.items() + n = len(items) + if n == 0: + return + if n == 1: + for k, v in items: + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEM) + return + if n <= batch_size: + # Single batch: iterate items() directly, no batching machinery. + # dict_items iteration itself raises RuntimeError on size change, + # so mutation during save() (e.g. from persistent_id hooks) is + # detected. + write(MARK) + for k, v in items: + save(k) + try: + save(v) + except BaseException as exc: + exc.add_note(f'when serializing {_T(obj)} item {k!r}') + raise + write(SETITEMS) + return + # Large dict: delegate to the generic path, which uses batched() + # over the live items iterator and preserves dict mutation-during- + # save detection. The per-batch tuple allocation is amortised over + # BATCHSIZE items here, so the exact-dict fast-path advantage is + # concentrated on the n <= batch_size case above. + self._batch_setitems(items, obj) + def _batch_setitems(self, items, obj): # Helper to batch up SETITEMS sequences; proto >= 1 only save = self.save diff --git a/Misc/NEWS.d/next/Library/2026-04-18-00-28-38.gh-issue-148706.mvlqm5.rst b/Misc/NEWS.d/next/Library/2026-04-18-00-28-38.gh-issue-148706.mvlqm5.rst new file mode 100644 index 00000000000000..1290074ee307b8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-18-00-28-38.gh-issue-148706.mvlqm5.rst @@ -0,0 +1,16 @@ +Speed up :class:`pickle._Pickler` (the pure-Python pickler fallback, +also the base class of :class:`dill.Pickler`) on the ``dump`` path by +inlining the frame-boundary check, reordering +:meth:`~pickle._Pickler.save` to dispatch atomic types (``str``, +``bytes``, ``int``, ``None``, ``bool``, ``float``) ahead of the memo +lookup to match the C reference implementation in +:mod:`!Modules/_pickle.c`, adding exact-container fast paths for +:class:`list` and :class:`dict` under binary protocols, inlining +``MEMOIZE`` for protocol 4+, and precomputing the ``BININT1`` opcode +byte sequences for integers in ``0..255``. Pure-Python +:meth:`pickle._Pickler.dump` is now 20–49% faster on representative +workloads; :func:`dill.dumps` (which inherits from +:class:`pickle._Pickler`) is 19–37% faster on the same shapes. One +user-visible semantic change: atomic types no longer invoke +:meth:`~pickle._Pickler.reducer_override`, aligning pure-Python +behaviour with the long-standing C dispatch order. diff --git a/Misc/pickle-perf-data/README.md b/Misc/pickle-perf-data/README.md new file mode 100644 index 00000000000000..5068a0d4caa7bf --- /dev/null +++ b/Misc/pickle-perf-data/README.md @@ -0,0 +1,90 @@ +# Pickle Perf Raw Data + +Raw artifacts backing `Misc/pickle-perf-diary.md`. Regeneratable; +checked in so reviewers can re-verify numbers without rerunning the +methodology. + +## Harness + +`pickle_pure_bench.py` — the pure-Python `pickle._Pickler` / +`_Unpickler` benchmark used throughout. Five workloads (list-of-ints, +list-of-strs, flat str-keyed dict, deep list-of-lists, nested +list-of-dicts). Each reports a best-of-9 median for dump and load at +protocol 5. + +`pickle_pure_bench_bytes.py` — bytes-heavy workload (short bytes, +medium bytes, bytearrays, bytes-keyed dict). Introduced in round 2 to +evaluate F6 (bytes in the save() fast path). + +`pickle_save_profile.py` — `cProfile`-based breakdown used to identify +which internal calls dominate `save()` (informed the priority ordering +of ideas D, E over B; in round 2, drove the F1 / F2 / F4 ordering). + +Run each with `taskset -c 0 ./python