Source code for mail_deduplicate

# Copyright Kevin Deldycke <> and contributors.
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
"""Expose package-wide elements."""

from __future__ import annotations

from pathlib import Path

from boltons.iterutils import unique

__version__ = "7.3.1"

HASH_HEADERS: tuple[str, ...] = (
    # 'Cc',
    # 'Bcc',
    # 'Reply-To',
"""Default ordered list of headers to use to compute the unique hash of a mail.

By default we choose to exclude:

``Cc``
  Since ``mailman`` apparently sometimes trims list members
  from the ``Cc`` header to avoid sending duplicates, which means that copies of mail
  reflected back from the list server will have a different ``Cc`` to the copy saved by
  the MUA at send-time.

``Bcc``
  Because copies of the mail saved by the MUA at send-time will have ``Bcc``, but copies
  reflected back from the list server won't.

``Reply-To``
  Since a mail could be ``Cc``'d to two lists with different ``Reply-To`` munging
  options set.
"""

"""Below this value, we consider not having enough headers to compute a solid hash."""

"""Default size threshold in bytes.

Since we're ignoring the ``Content-Length`` header by default because of mailing-list
effects, we introduced a limit on the allowed difference between the sizes of the
message payloads.

If this is exceeded, a warning is issued and the messages are not considered duplicates,
because this could point to message corruption somewhere, or a false positive.

.. note::
    Headers are not counted towards this threshold, because many headers can be added
    by mailing list software such as ``mailman``, or even by the process of sending
    the mail through various MTAs.

    One copy could have been stored by the sender's MUA prior to sending, without any
    ``Received:`` headers, and another copy could be reflected back via a ``Cc``-to-self
    mechanism or mailing list server.

    This threshold has to be large enough to allow for footers added by mailing list
    servers.
"""

"""Default content threshold in bytes.

As above, we similarly generate unified diffs of duplicates and ensure that the diff is
not greater than a certain size to limit false-positives.
"""

# Time source identifier: take the timestamp from the message's ``Date`` header.
DATE_HEADER = "date-header"
# Time source identifier: take the timestamp from the mail's file on the filesystem.
CTIME = "ctime"
"""Methods used to extract a mail's canonical timestamp:

- ``date-header``: sourced from the message's ``Date`` header.
- ``ctime``: sourced from the email's file on the filesystem. Only available for
  ``maildir`` sources.
"""

class TooFewHeaders(Exception):
    """Not enough headers were found to produce a solid hash."""
class SizeDiffAboveThreshold(Exception):
    """Difference in mail size is greater than the threshold.

    See ``mail_deduplicate.DEFAULT_SIZE_THRESHOLD`` for the default value.
    """
class ContentDiffAboveThreshold(Exception):
    """Difference in mail content is greater than the threshold.

    See ``mail_deduplicate.DEFAULT_CONTENT_THRESHOLD`` for the default value.
    """
class Config:
    """Holds global configuration.

    Options are passed as keyword arguments, merged on top of ``default_conf``
    and kept in the ``conf`` dictionary. Every entry is readable as an
    attribute (see ``__getattr__``).
    """

    # Keep these defaults in sync with CLI option definitions.
    default_conf = {
        "dry_run": False,
        "input_format": False,
        "force_unlock": False,
        "hash_headers": HASH_HEADERS,
        "hash_body": None,
        "hash_only": False,
        "size_threshold": DEFAULT_SIZE_THRESHOLD,
        "content_threshold": DEFAULT_CONTENT_THRESHOLD,
        "show_diff": False,
        "strategy": None,
        "time_source": None,
        "regexp": None,
        "action": None,
        "export": None,
        "export_format": "mbox",
        "export_append": False,
    }

    def __init__(self, **kwargs) -> None:
        """Validates configuration parameter types and values.

        :param kwargs: option values overriding the entries of ``default_conf``.
        :raises ValueError: if an option name is not a key of ``default_conf``.
        :raises AssertionError: if a threshold is below ``-1`` or a header name
            contains a character outside the 33-126 ASCII range.
        :raises FileExistsError: if the export path already exists and
            ``export_append`` is not ``True``.

        .. note::
            Threshold and header checks rely on ``assert`` statements, so they
            are skipped when Python runs with ``-O``.
        """
        # Load default values.
        self.conf = self.default_conf.copy()

        unrecognized_options = set(kwargs) - set(self.default_conf)
        if unrecognized_options:
            msg = f"Unrecognized {unrecognized_options} options."
            raise ValueError(msg)

        # Replace default values with our config.
        self.conf.update(kwargs)

        # Check thresholds.
        assert self.size_threshold >= -1
        assert self.content_threshold >= -1

        # Headers are case-insensitive in Python implementation.
        normalized_headers = (h.lower() for h in self.hash_headers)  # type: ignore[has-type]
        # Remove duplicate entries.
        normalized_headers = unique(normalized_headers)
        # Mail headers are composed of ASCII characters between 33 and 126
        # (both inclusive) according to RFC 5322.
        for hid in normalized_headers:
            ascii_indexes = set(map(ord, hid))
            assert max(ascii_indexes) <= 126
            assert min(ascii_indexes) >= 33
        # Stored as a regular instance attribute: it takes precedence over the
        # raw ``conf["hash_headers"]`` entry on attribute access.
        self.hash_headers = tuple(normalized_headers)

        # Export mail box will always be created from scratch and is not
        # expected to exist in the first place.
        if self.export:  # type: ignore[has-type]
            self.export = Path(self.export).resolve()  # type: ignore[has-type]
            if self.export.exists() and self.export_append is not True:
                raise FileExistsError(self.export)

    def __getattr__(self, attr_id):
        """Expose configuration entries as properties.

        Only invoked when regular attribute lookup fails. Returns the ``conf``
        entry named ``attr_id``, or ``None`` if there is no such entry.

        :raises AttributeError: for dunder names (``__x__``). These are never
            configuration entries, and answering ``None`` would make protocol
            probes such as ``hasattr(obj, "__setstate__")`` (as performed by
            ``copy`` and ``pickle``) wrongly succeed.
        """
        if attr_id.startswith("__") and attr_id.endswith("__"):
            raise AttributeError(attr_id)
        # Read ``conf`` straight from the instance dict: going through
        # ``self.conf`` would re-enter this method, and recurse forever, on an
        # instance whose ``conf`` has not been set yet.
        conf = self.__dict__.get("conf", {})
        if attr_id in conf:
            return conf[attr_id]
        return None