# Copyright Kevin Deldycke <kevin@deldycke.com> and contributors.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""Expose package-wide elements."""
from __future__ import annotations
# Version string of the package, exposed as a module-level attribute.
# NOTE(review): release tooling may match this exact line textually -- confirm
# before changing its formatting.
__version__ = "8.0.0"
HASH_HEADERS: tuple[str, ...] = (
    "Date",
    "From",
    "To",
    # The three headers below are deliberately left out of the defaults; the
    # rationale for each one is given in the docstring that follows.
    # "CC",
    # "BCC",
    # "Reply-To",
    "Subject",
    "MIME-Version",
    "Content-Type",
    "Content-Disposition",
    "User-Agent",
    "X-Priority",
    "Message-ID",
)
"""Default ordered list of headers used to compute the unique hash of a mail.

The following headers are excluded by default:

``CC``
    Since ``mailman`` apparently `sometimes trims list members
    <https://mail.python.org/pipermail/mailman-developers/2002-September/013233.html>`_
    from the ``CC`` header to avoid sending duplicates. This means that copies of a
    mail reflected back from the list server will have a different ``CC`` to the copy
    saved by the MUA at send-time.

``BCC``
    Because copies of the mail saved by the MUA at send-time will have ``BCC``, but
    copies reflected back from the list server won't.

``Reply-To``
    Since a mail could be ``CC``'d to two lists with different ``Reply-To`` munging
    options set.
"""
# Built from a set literal: going through an intermediate list is unnecessary
# when the result is a frozenset anyway.
ADDRESS_HEADERS: frozenset[str] = frozenset(
    {
        "from",
        "to",
        "cc",
        "bcc",
        "reply-to",
        "sender",
        "return-path",
        "resent-from",
        "resent-to",
        "resent-cc",
        "resent-bcc",
        "resent-reply-to",
        "resent-sender",
        "delivered-to",
        "x-original-to",
        "envelope-to",
        "x-envelope-from",
        "x-envelope-to",
        "disposition-notification-to",
        "original-recipient",
    }
)
"""Headers that contain email addresses.

.. danger::
    These IDs should be kept lower-case, because they are compared to those provided
    to the ``-h``/``--hash-header`` option, which are carried by the ``hash_headers``
    property of the configuration.
"""
# Plain alias: this name is bound to the very same frozenset object as
# ``ADDRESS_HEADERS``, not to a copy of it.
QUOTE_DISCARD_HEADERS: frozenset[str] = ADDRESS_HEADERS
"""Headers whose values get their quotes discarded.

For instance, ``"Bob" <bob@example.com>`` should hash to the same thing as
``Bob <bob@example.com>``.
"""
MINIMAL_HEADERS_COUNT: int = 4
"""Minimal number of headers required to compute a solid hash.

Below this value, we consider not having enough headers.
"""
DEFAULT_SIZE_THRESHOLD: int = 512
"""Default size threshold, in bytes.

The ``Content-Length`` header is ignored by default `because of mailing-list effects
<https://kdeldycke.github.io/mail-deduplicate/design.html#mailing-lists>`_, so a limit
was introduced on the allowed difference between the sizes of the message payloads.

If this limit is exceeded, a warning is issued and the messages are not considered
duplicates, because this could point to message corruption somewhere, or a false
positive.

.. note::
    Headers are not counted towards this threshold, because many `headers can be
    added by mailing list software
    <https://kdeldycke.github.io/mail-deduplicate/design.html#mailing-lists>`_ such
    as ``mailman``, or even by the process of sending the mail through various MTAs.

    One copy could have been stored by the sender's MUA prior to sending, without any
    ``Received`` headers, and another copy could be reflected back via a
    ``CC``-to-self mechanism or mailing list server.

    This threshold has to be large enough to allow for footers added by mailing list
    servers.
"""
DEFAULT_CONTENT_THRESHOLD: int = 768
"""Default content threshold, in bytes.

Similarly to the size threshold, we generate unified diffs of duplicates and ensure
that the diff is not greater than a certain size, to limit false-positives.
"""
class SizeDiffAboveThreshold(Exception):
    """Difference in mail size is greater than the threshold.

    The default threshold is documented at `DEFAULT_SIZE_THRESHOLD
    <https://kdeldycke.github.io/mail-deduplicate/mail_deduplicate.html#mail_deduplicate.DEFAULT_SIZE_THRESHOLD>`_.
    """
class ContentDiffAboveThreshold(Exception):
    """Difference in mail content is greater than the threshold.

    The default threshold is documented at `DEFAULT_CONTENT_THRESHOLD
    <https://kdeldycke.github.io/mail-deduplicate/mail_deduplicate.html#mail_deduplicate.DEFAULT_CONTENT_THRESHOLD>`_.
    """