Source code for mail_deduplicate.strategy
# Copyright Kevin Deldycke <kevin@deldycke.com> and contributors.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""Strategy definitions."""
from __future__ import annotations
import enum
import logging
import random
import re
from click_extra.colorize import default_theme as theme
TYPE_CHECKING = False
if TYPE_CHECKING:
from typing import Callable
from .deduplicate import DuplicateSet
from .mail import DedupMailMixin
[docs]
def select_older(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick every mail dated strictly before the newest timestamp of the set.

    The mails left out are those sharing ``duplicates.newest_timestamp``.
    """
    cutoff = duplicates.newest_timestamp
    logging.info(
        f"Select all mails strictly older than the {cutoff} timestamp...",
    )
    older = set()
    for mail in duplicates.pool:
        if mail.timestamp < cutoff:
            older.add(mail)
    return older
[docs]
def select_oldest(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick the mails whose timestamp equals the oldest one of the set.

    Every mail with a timestamp different from ``duplicates.oldest_timestamp``
    is left out of the selection.
    """
    oldest = duplicates.oldest_timestamp
    logging.info(f"Select all mails sharing the oldest {oldest} timestamp...")
    return set(mail for mail in duplicates.pool if mail.timestamp == oldest)
[docs]
def select_newer(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick every mail dated strictly after the oldest timestamp of the set.

    The mails left out are those sharing ``duplicates.oldest_timestamp``.
    """
    floor = duplicates.oldest_timestamp
    logging.info(
        f"Select all mails strictly newer than the {floor} timestamp...",
    )
    newer = set()
    for mail in duplicates.pool:
        if mail.timestamp > floor:
            newer.add(mail)
    return newer
[docs]
def select_newest(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick the mails whose timestamp equals the newest one of the set.

    Every mail with a timestamp different from ``duplicates.newest_timestamp``
    is left out of the selection.
    """
    newest = duplicates.newest_timestamp
    logging.info(f"Select all mails sharing the newest {newest} timestamp...")
    return set(mail for mail in duplicates.pool if mail.timestamp == newest)
[docs]
def select_smaller(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick every mail strictly smaller than the biggest size of the set.

    The mails left out are those whose size equals ``duplicates.biggest_size``.
    """
    ceiling = duplicates.biggest_size
    logging.info(f"Select all mails strictly smaller than {ceiling} bytes...")
    smaller = set()
    for mail in duplicates.pool:
        if mail.size < ceiling:
            smaller.add(mail)
    return smaller
[docs]
def select_smallest(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick the mails whose size equals the smallest one of the set.

    Every mail with a size different from ``duplicates.smallest_size`` is left
    out of the selection.
    """
    smallest = duplicates.smallest_size
    logging.info(
        f"Select all mails sharing the smallest size of {smallest} bytes...",
    )
    return set(mail for mail in duplicates.pool if mail.size == smallest)
[docs]
def select_bigger(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick every mail strictly bigger than the smallest size of the set.

    The mails left out are those whose size equals ``duplicates.smallest_size``.
    """
    floor = duplicates.smallest_size
    logging.info(f"Select all mails strictly bigger than {floor} bytes...")
    bigger = set()
    for mail in duplicates.pool:
        if mail.size > floor:
            bigger.add(mail)
    return bigger
[docs]
def select_biggest(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick the mails whose size equals the biggest one of the set.

    Every mail with a size different from ``duplicates.biggest_size`` is left
    out of the selection.
    """
    biggest = duplicates.biggest_size
    logging.info(
        f"Select all mails sharing the biggest size of {biggest} bytes...",
    )
    return set(mail for mail in duplicates.pool if mail.size == biggest)
[docs]
def select_matching_path(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick the mails whose file path is matched by the configured regexp.

    The regular expression is read from ``duplicates.conf["regexp"]`` (the value
    of the ``--regexp`` parameter) and searched within each ``mail.path``.
    """
    regexp = duplicates.conf["regexp"]
    assert regexp is not None
    logging.info(
        f"Select all mails with file path matching the {regexp.pattern} regexp...",
    )
    matching = set()
    for mail in duplicates.pool:
        if re.search(regexp, mail.path):
            matching.add(mail)
    return matching
[docs]
def select_non_matching_path(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick the mails whose file path is not matched by the configured regexp.

    The regular expression is read from ``duplicates.conf["regexp"]`` (the value
    of the ``--regexp`` parameter) and searched within each ``mail.path``.
    """
    regexp = duplicates.conf["regexp"]
    assert regexp is not None
    logging.info(
        "Select all mails with file path not matching the "
        f"{regexp.pattern} regexp...",
    )
    non_matching = set()
    for mail in duplicates.pool:
        if re.search(regexp, mail.path):
            continue
        non_matching.add(mail)
    return non_matching
[docs]
def select_one(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick a single mail at random from the pool; all the others are left out."""
    candidates = tuple(duplicates.pool)
    picked = random.choice(candidates)
    return {picked}
[docs]
def select_all_but_one(duplicates: DuplicateSet) -> set[DedupMailMixin]:
    """Pick every mail of the pool except one, which is left out at random."""
    candidates = tuple(duplicates.pool)
    kept = random.sample(candidates, k=len(candidates) - 1)
    return set(kept)
[docs]
@enum.unique
class Strategy(enum.Enum):
    """Selection strategies applicable to a set of duplicate mails.

    Every member resolves, through the ``strategy_function`` property, to the
    module-level function that implements its selection logic.

    Members carrying a string value are aliases: the string is the name of the
    function to call. Members carrying an integer value name their function
    themselves, i.e. the function is looked up under the lowercased member name.

    Giving each member a distinct value keeps ``Enum`` from collapsing the
    aliases into the members they point to, so every strategy exists on its own.

    The ``DISCARD_*`` aliases express the inverse reading of a ``SELECT_*``
    operation, which lets users pick whichever wording fits their mental model.
    """

    # Time-based strategies.
    SELECT_OLDER = 1
    SELECT_OLDEST = 2
    SELECT_NEWER = 3
    SELECT_NEWEST = 4
    DISCARD_NEWEST = "select_older"
    DISCARD_NEWER = "select_oldest"
    DISCARD_OLDEST = "select_newer"
    DISCARD_OLDER = "select_newest"

    # Size-based strategies.
    SELECT_SMALLER = 5
    SELECT_SMALLEST = 6
    SELECT_BIGGER = 7
    SELECT_BIGGEST = 8
    DISCARD_BIGGEST = "select_smaller"
    DISCARD_BIGGER = "select_smallest"
    DISCARD_SMALLEST = "select_bigger"
    DISCARD_SMALLER = "select_biggest"

    # Location-based strategies.
    SELECT_MATCHING_PATH = 9
    SELECT_NON_MATCHING_PATH = 10
    DISCARD_NON_MATCHING_PATH = "select_matching_path"
    DISCARD_MATCHING_PATH = "select_non_matching_path"

    # Quantity-based strategies.
    SELECT_ONE = 11
    SELECT_ALL_BUT_ONE = 12
    DISCARD_ALL_BUT_ONE = "select_one"
    DISCARD_ONE = "select_all_but_one"

    def __str__(self):
        """Return the strategy as spelled on the CLI: lowercase, dash-separated."""
        return "-".join(self.name.lower().split("_"))

    @property
    def strategy_function(self) -> Callable:
        """Return the module-level function implementing this strategy.

        The function name is the member value for string-valued aliases, and
        the lowercased member name otherwise. It is looked up in this module's
        globals, so a missing function raises ``KeyError``.
        """
        is_alias = isinstance(self.value, str)
        func_id = self.value if is_alias else self.name.lower()
        return globals()[func_id]  # type: ignore[no-any-return]

    def apply_strategy(self, duplicates: DuplicateSet) -> set[DedupMailMixin]:
        """Run this strategy against the provided duplicate set.

        Returns the selected mail objects as a set.
        """
        logging.info(f"Apply {theme.choice(str(self))} strategy...")
        selector = self.strategy_function
        return set(selector(duplicates))