# minions-ai-agents/tests/test_dlp.py

"""
Tests for DLP Filter (Data Loss Prevention).

Validates regex patterns for sensitive data redaction.
"""

import pytest

from src.security.dlp_filter import DLPFilter, sanitize_text


class TestDLPFilter:
    """Tests for DLP Filter functionality.

    Each test passes text containing a known sensitive value through
    ``DLPFilter.sanitize`` (or the ``sanitize_text`` helper) and inspects the
    returned string for the expected redaction marker and for the absence of
    the original sensitive value.
    """

    @pytest.fixture
    def dlp(self):
        """Create a fresh DLP filter for each test."""
        return DLPFilter()

    def test_password_redaction_simple(self, dlp):
        """Test simple password redaction."""
        text = "My password is: secret123"
        result = dlp.sanitize(text)

        # The sensitive value should be removed
        assert "secret123" not in result
        # A redaction marker must be present. Checking for the word
        # "password" would prove nothing: it is part of the input label and
        # survives whether or not the value was actually redacted.
        assert "REDACTED" in result, f"No redaction marker in {result!r}"

    def test_password_redaction_various_formats(self, dlp):
        """Test password redaction in various formats."""
        cases = [
            ("password=mypass123", "mypass123"),
            ("senha: minhasenha456", "minhasenha456"),
            ("pwd: abc123def", "abc123def"),
            ('secret="topsecret789"', "topsecret789"),
        ]

        for text, sensitive_value in cases:
            result = dlp.sanitize(text)
            # Original sensitive value should be gone
            assert sensitive_value not in result, f"Value '{sensitive_value}' still in result"

    def test_api_key_redaction(self, dlp):
        """Test API key redaction."""
        text = "api_key=sk-proj-1234567890abcdefghij"
        result = dlp.sanitize(text)

        assert "sk-proj-1234567890abcdefghij" not in result
        assert "[REDACTED]" in result

    def test_cpf_redaction(self, dlp):
        """Test Brazilian CPF redaction (formatted and digits-only)."""
        cases = [
            ("CPF: 123.456.789-00", "123.456.789-00"),
            ("cpf=12345678900", "12345678900"),
            ("O CPF 123.456.789-00 está cadastrado", "123.456.789-00"),
        ]

        for text, sensitive_value in cases:
            result = dlp.sanitize(text)
            assert "[CPF_REDACTED]" in result, f"No CPF marker for input {text!r}"
            # The marker alone does not prove the number itself is gone.
            assert sensitive_value not in result, f"Value '{sensitive_value}' still in result"

    def test_cnpj_redaction(self, dlp):
        """Test Brazilian CNPJ redaction (formatted and digits-only)."""
        cases = [
            ("CNPJ: 12.345.678/0001-90", "12.345.678/0001-90"),
            ("cnpj=12345678000190", "12345678000190"),
        ]

        for text, sensitive_value in cases:
            result = dlp.sanitize(text)
            assert "[CNPJ_REDACTED]" in result, f"No CNPJ marker for input {text!r}"
            # The marker alone does not prove the number itself is gone.
            assert sensitive_value not in result, f"Value '{sensitive_value}' still in result"

    def test_credit_card_redaction(self, dlp):
        """Test credit card number redaction (dash, space and bare formats)."""
        cases = [
            ("Card: 4111-1111-1111-1111", "4111-1111-1111-1111"),
            ("Number: 4111 1111 1111 1111", "4111 1111 1111 1111"),
            ("Cartão: 4111111111111111", "4111111111111111"),
        ]

        for text, sensitive_value in cases:
            result = dlp.sanitize(text)
            assert "[CARD_REDACTED]" in result, f"No card marker for input {text!r}"
            # The marker alone does not prove the number itself is gone.
            assert sensitive_value not in result, f"Value '{sensitive_value}' still in result"

    def test_email_partial_redaction(self, dlp):
        """Test partial email redaction (keep domain)."""
        text = "Contact joao.silva@empresa.com.br for help"
        result = dlp.sanitize(text)

        assert "joao.silva" not in result
        assert "empresa.com.br" in result  # Domain kept
        assert "[USER]@empresa.com.br" in result

    def test_private_key_redaction(self, dlp):
        """Test SSH private key redaction."""
        text = """
        -----BEGIN RSA PRIVATE KEY-----
        MIIEpAIBAAKCAQEA0Z...
        -----END RSA PRIVATE KEY-----
        """
        result = dlp.sanitize(text)

        assert "MIIEpAIBAAKCAQEA0Z" not in result
        assert "[KEY_REDACTED]" in result

    def test_normal_text_unchanged(self, dlp):
        """Test that normal text is not modified."""
        text = "Hello, this is a normal support message about server performance."
        result = dlp.sanitize(text)

        assert result == text

    def test_mixed_content(self, dlp):
        """Test text with both sensitive and normal content."""
        text = """
        Olá, preciso de ajuda com o servidor srv-app01.
        senha: admin123
        O CPF do responsável é 123.456.789-00
        Por favor, me ajudem!
        """
        result = dlp.sanitize(text)

        # Normal content preserved
        assert "srv-app01" in result
        assert "Por favor" in result

        # Sensitive content redacted
        assert "admin123" not in result
        assert "[CPF_REDACTED]" in result
        assert "123.456.789-00" not in result

    def test_statistics_tracking(self, dlp):
        """Test that statistics are tracked correctly."""
        dlp.sanitize("password=test123")
        dlp.sanitize("CPF: 123.456.789-00")
        dlp.sanitize("Normal text")

        stats = dlp.get_stats()

        # Three calls were made; at least the two sensitive inputs must count
        # as redactions.
        assert stats["total_processed"] == 3
        assert stats["total_redacted"] >= 2

    def test_convenience_function(self):
        """Test the sanitize_text convenience function."""
        result = sanitize_text("password=secret")

        assert "secret" not in result
        assert "[REDACTED]" in result

    def test_empty_input(self, dlp):
        """Test handling of empty input ("" and None pass through unchanged)."""
        assert dlp.sanitize("") == ""
        assert dlp.sanitize(None) is None