Source code for tests.test_pygments

# Copyright Kevin Deldycke <kevin@deldycke.com> and contributors.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

from __future__ import annotations

import sys
import tarfile
from importlib import metadata
from operator import itemgetter
from pathlib import Path

import pytest
import requests
from boltons.strutils import camel2under
from boltons.typeutils import issubclass
from pygments.filter import Filter
from pygments.filters import get_filter_by_name
from pygments.formatter import Formatter
from pygments.formatters import get_formatter_by_name
from pygments.lexer import Lexer
from pygments.lexers import find_lexer_class_by_name, get_lexer_by_name

from click_extra import pygments as extra_pygments
from click_extra.pygments import DEFAULT_TOKEN_TYPE, collect_session_lexers

if sys.version_info >= (3, 11):
    import tomllib
else:
    import tomli as tomllib  # type: ignore[import-not-found]


# Folder two levels above this test module. ``pyproject.toml`` and the ``docs``
# folder are looked up relative to it by the helpers and tests below.
PROJECT_ROOT = Path(__file__).parent.parent



[docs]
@pytest.mark.once
def test_ansi_lexers_candidates(tmp_path):
    """Look into Pygments test suite to find all ANSI lexers candidates.

    Good candidates for ANSI colorization are lexers that are producing
    ``Generic.Output`` tokens, which are often used by REPL-like and scripting
    terminal to render text in a console.

    The list is manually maintained in Click Extra code, and this test is here to
    detect new candidates from new releases of Pygments.

    .. attention::
        The Pygments source code is downloaded from GitHub in the form of an archive,
        and inspected in place from a temporary folder.

        The version of Pygments used for this test is the one installed in the current
        environment.

    .. danger:: Security check
        While browsing the archive, we double check we are not fed an archive
        exploiting relative ``..`` or ``.`` path attacks.
    """
    version = metadata.version("pygments")

    source_url = (
        f"https://github.com/pygments/pygments/archive/refs/tags/{version}.tar.gz"
    )
    base_folder = f"pygments-{version}"
    archive_path = tmp_path / f"{base_folder}.tar.gz"

    # Download the source distribution from GitHub. The timeout makes an
    # unresponsive server fail the test instead of blocking it indefinitely.
    with requests.get(source_url, timeout=60) as response:
        assert response.ok
        archive_path.write_bytes(response.content)

    assert archive_path.exists()
    assert archive_path.is_file()
    assert archive_path.stat().st_size > 0

    # Member paths are resolved before being checked, so the reference folder
    # must be resolved too: otherwise a symlinked temporary folder would make
    # both the security check and the pattern matching fail spuriously.
    extraction_root = tmp_path.resolve()

    # Locations of lexer artifacts in test suite.
    parser_token_traces = {
        str(extraction_root / base_folder / "tests" / "examplefiles" / "*" / "*.output"),
        str(extraction_root / base_folder / "tests" / "snippets" / "*" / "*.txt"),
    }

    # Marker of a token of the default type, as it appears at the end of a line
    # in the token traces. Does not change from one archive member to another.
    token_marker = f" {'.'.join(DEFAULT_TOKEN_TYPE)}\n"

    # Browse the downloaded package to find the test suite, and inspect the
    # traces of parsed tokens used as gold master for lexers tests.
    lexer_candidates = set()
    with tarfile.open(archive_path, "r:gz") as tar:
        for member in tar.getmembers():
            # Skip anything that is not a regular file (directories, links, ...).
            if not member.isfile():
                continue

            # XXX Security check of relative ``..`` or ``.`` path attacks.
            filename = extraction_root.joinpath(member.name).resolve()
            assert filename.is_relative_to(extraction_root)

            # Skip files that are not part of the test suite data.
            if not any(filename.match(pattern) for pattern in parser_token_traces):
                continue

            file = tar.extractfile(member)
            # Skip members for which no readable file object is available.
            if not file:
                continue

            with file:
                content = file.read().decode("utf-8")

            # Skip lexers that are not rendering generic, terminal-like output
            # tokens.
            if token_marker not in content:
                continue

            # Extract lexer alias from the test file path.
            lexer_candidates.add(filename.parent.name)

    assert lexer_candidates
    lexer_classes = {find_lexer_class_by_name(alias) for alias in lexer_candidates}
    # We cannot test for strict equality yet, as some ANSI-ready lexers do not
    # have any test artifacts producing ``Generic.Output`` tokens.
    assert lexer_classes <= set(collect_session_lexers())




[docs]
def collect_classes(klass, prefix="Ansi"):
    """Map names to the matching classes exposed by ``click_extra.pygments``.

    Only the attributes of the module that are a subclass of ``klass`` and whose
    name starts with ``prefix`` are kept.
    """
    selected = {}
    for attr_name, attr_value in vars(extra_pygments).items():
        # ``issubclass`` is the ``boltons.typeutils`` variant imported by this file,
        # not the builtin.
        if not issubclass(attr_value, klass):
            continue
        if not attr_name.startswith(prefix):
            continue
        selected[attr_name] = attr_value
    return selected




[docs]
def get_pyproject_section(*section_path: str) -> dict[str, str]:
    """Return the part of ``pyproject.toml`` found at ``section_path``.

    The file is read from ``PROJECT_ROOT`` and parsed as TOML, then each item of
    ``section_path`` is used in turn as a key to go one level deeper. A missing key
    is not handled here and surfaces as the lookup error.
    """
    pyproject_file = (PROJECT_ROOT / "pyproject.toml").resolve()
    raw_toml = pyproject_file.read_text(encoding="utf-8")
    node: dict = tomllib.loads(raw_toml)
    for key in section_path:
        node = node[key]
    return node




[docs]
def check_entry_points(entry_points: dict[str, str], *section_path: str) -> None:
    """Assert ``entry_points`` equals the ``pyproject.toml`` section at ``section_path``.

    The expected entry points are rebuilt in key order before the comparison.
    """
    expected = {entry_id: entry_points[entry_id] for entry_id in sorted(entry_points)}
    declared = get_pyproject_section(*section_path)
    assert declared == expected




[docs]
@pytest.mark.once
def test_formatter_entry_points():
    """Compare the formatter entry points of ``pyproject.toml`` with the formatter
    classes collected from ``click_extra.pygments``."""
    expected = {
        camel2under(class_name).replace("_", "-"): f"click_extra.pygments:{class_name}"
        for class_name in collect_classes(Formatter)
    }

    check_entry_points(expected, "project", "entry-points", "pygments.formatters")




[docs]
@pytest.mark.once
def test_filter_entry_points():
    """Compare the filter entry points of ``pyproject.toml`` with the filter classes
    collected from ``click_extra.pygments``."""
    expected = {
        camel2under(class_name).replace("_", "-"): f"click_extra.pygments:{class_name}"
        for class_name in collect_classes(Filter)
    }

    check_entry_points(expected, "project", "entry-points", "pygments.filters")




[docs]
@pytest.mark.once
def test_lexer_entry_points():
    """Compare the lexer entry points of ``pyproject.toml`` with the ANSI variants of
    the session lexers."""
    expected = {}
    for lexer_class in collect_session_lexers():
        # Each session lexer must have an ``Ansi``-prefixed counterpart importable
        # from Click Extra.
        ansi_name = "Ansi" + lexer_class.__name__
        assert ansi_name in vars(extra_pygments)

        # The entry point ID is the dash-separated, lowercased class name, without
        # its ``lexer`` words.
        words = camel2under(ansi_name).split("_")
        kept_words = [word for word in words if word != "lexer"]
        entry_id = "-".join(kept_words)

        # The entry point targets the class within the Click Extra module.
        expected[entry_id] = f"click_extra.pygments:{ansi_name}"

    check_entry_points(expected, "project", "entry-points", "pygments.lexers")




[docs]
@pytest.mark.once
def test_registered_formatters():
    """Look up every alias of the collected formatter classes through Pygments.

    The test passes as long as none of the lookups raises.
    """
    formatter_classes = collect_classes(Formatter)
    for formatter_class in formatter_classes.values():
        for alias_name in formatter_class.aliases:
            get_formatter_by_name(alias_name)




[docs]
@pytest.mark.once
def test_registered_filters():
    """Look up every collected filter class through Pygments.

    The name used for the lookup is derived from the class name, the same way the
    entry point IDs are. The test passes as long as none of the lookups raises.
    """
    for class_name in collect_classes(Filter):
        filter_id = camel2under(class_name).replace("_", "-")
        get_filter_by_name(filter_id)




[docs]
@pytest.mark.once
def test_registered_lexers():
    """Look up every alias of the collected lexer classes through Pygments.

    The test passes as long as none of the lookups raises.
    """
    lexer_classes = collect_classes(Lexer)
    for lexer_class in lexer_classes.values():
        for alias_name in lexer_class.aliases:
            get_lexer_by_name(alias_name)




[docs]
@pytest.mark.once
def test_ansi_lexers_doc():
    """Check the name of each session lexer appears in ``docs/pygments.md``."""
    doc_path = PROJECT_ROOT / "docs/pygments.md"
    documentation = doc_path.read_text(encoding="utf-8")
    for lexer_class in collect_session_lexers():
        lexer_name = lexer_class.__name__
        assert lexer_name in documentation