Initial commit: Anthropic API and MITM proxy to WaybackProxy

This commit is contained in:
2026-03-18 01:27:44 -04:00
commit 8390c34d30
14 changed files with 1360 additions and 0 deletions

116
mitm/addon.py Normal file
View File

@@ -0,0 +1,116 @@
"""
mitmproxy addon for the Time Travel Simulation.
Routing logic:
- Requests to api.anthropic.com → pass through to real internet
- All other requests → rewrite to HTTP and forward to WaybackProxy
The key challenge: WaybackProxy is an HTTP-only proxy. When a client
requests https://www.bbc.com, mitmproxy terminates TLS, but we need
to send WaybackProxy an HTTP request for http://www.bbc.com.
We reconstruct the URL as http:// and send it as an explicit proxy
request to WaybackProxy on port 8888.
"""
import re
from mitmproxy import http, ctx
# Hosts that are exempt from the WaybackProxy redirect: requests to them are
# left untouched and reach the real internet, and their responses are not
# scrubbed.
PASSTHROUGH_DOMAINS = {
    "mitm.it",  # mitmproxy's own cert distribution page
    "api.anthropic.com",
}
# (compiled pattern, replacement) pairs applied IN ORDER to text response
# bodies to remove traces of the Wayback Machine.
#
# Order matters:
#   1. The toolbar block and the "/web/<timestamp>/" URL prefixes are stripped
#      first, so tags that merely point at an archived copy of the site's own
#      asset survive with their original URL restored.
#   2. Tag-level removals (script / link / meta) run next, while any remaining
#      archive URL is still present inside the tag to be matched on.
#   3. Only then is the fallback URL stripper applied. Running it earlier would
#      blank the URL and leave an empty `<script src="">` / `<link href="">`
#      behind, because the tag patterns would no longer match.
#   4. The bare "archive.org" text catch-all always runs last.
SCRUB_PATTERNS = [
    (re.compile(r'<!--\s*BEGIN WAYBACK TOOLBAR.*?END WAYBACK TOOLBAR\s*-->', re.DOTALL | re.IGNORECASE), ''),
    # Primary: strip archive.org URL prefixes — [^/]+/ matches any timestamp+modifier
    (re.compile(r'https?://web\.archive\.org/web/[^/]+/', re.IGNORECASE), ''),
    # Wayback-injected scripts
    (re.compile(r'<script[^>]*(?:archive\.org|wayback)[^>]*>.*?</script>', re.DOTALL | re.IGNORECASE), ''),
    # Wayback-injected CSS
    (re.compile(r'<link[^>]*(?:archive\.org|wayback)[^>]*/?>', re.IGNORECASE), ''),
    # Archive meta tags
    (re.compile(r'<meta[^>]*archive\.org[^>]*/?>', re.IGNORECASE), ''),
    # Wombat rewriting engine
    (re.compile(r'(?:var\s+)?_?wbhack[^;]*;', re.IGNORECASE), ''),
    (re.compile(r'WB_wombat_Init\([^)]*\);?', re.IGNORECASE), ''),
    # Data attributes
    (re.compile(r'\s*data-(?:wayback|archive)[^=]*="[^"]*"', re.IGNORECASE), ''),
    # Fallback: catch any remaining archive.org URLs (after tag removal, see above)
    (re.compile(r'https?://web\.archive\.org[^\s"\'<>)]*', re.IGNORECASE), ''),
    # Any remaining text references
    (re.compile(r'archive\.org', re.IGNORECASE), ''),
]
# Lower-case header-name prefixes. They are compared against the lower-cased
# name of each response header; a header whose name starts with any of them
# is dropped from the response.
SCRUB_HEADERS = ["x-archive-orig-", "x-archive-", "x-wayback-"]
class TimeTravelRouter:
    """mitmproxy addon that routes traffic to WaybackProxy and hides its traces.

    Requests whose host is in ``PASSTHROUGH_DOMAINS`` are left untouched.
    Every other request is downgraded to plain HTTP and re-targeted at the
    WaybackProxy instance; the matching responses have archive-related
    headers and body fragments removed.
    """

    # Address of the WaybackProxy instance that serves the archived pages.
    WAYBACK_HOST = "172.30.0.3"
    WAYBACK_PORT = 8888

    def request(self, flow: http.HTTPFlow) -> None:
        """Redirect a non-passthrough request to WaybackProxy over HTTP.

        Args:
            flow: The intercepted flow; its request is modified in place.
        """
        original_host = flow.request.pretty_host
        if original_host in PASSTHROUGH_DOMAINS:
            ctx.log.info(f"[PASSTHROUGH] {flow.request.method} {flow.request.pretty_url}")
            return

        # Build the original URL with the scheme downgraded to HTTP; this is
        # what WaybackProxy looks up. request.path includes the query string.
        http_url = f"http://{original_host}{flow.request.path}"
        ctx.log.info(f"[WAYBACK] {flow.request.pretty_url} -> {http_url}")

        # Assigning .url re-derives scheme, host, port and path from the URL,
        # so it must happen BEFORE the destination is pointed at WaybackProxy.
        # Doing it afterwards would silently reset host/port to the original
        # site and the request would bypass WaybackProxy altogether.
        flow.request.url = http_url

        # Connect to WaybackProxy instead of the original site.
        flow.request.host = self.WAYBACK_HOST
        flow.request.port = self.WAYBACK_PORT

        # Set last: the assignments above may rewrite an existing Host header,
        # and response() relies on Host still naming the original site.
        flow.request.headers["Host"] = original_host

    def response(self, flow: http.HTTPFlow) -> None:
        """Remove archive-related headers and body fragments from a response.

        Passthrough responses are returned unmodified.

        Args:
            flow: The intercepted flow; its response is modified in place.
        """
        # pretty_host is derived from the Host header with any port removed,
        # so this check agrees with the one in request() even when the client
        # sent "Host: name:port".
        if flow.request.pretty_host in PASSTHROUGH_DOMAINS:
            return

        headers = flow.response.headers

        # Collect first, delete afterwards: never mutate while iterating.
        prefixes = tuple(SCRUB_HEADERS)
        leaking = [name for name in headers if name.lower().startswith(prefixes)]
        for name in leaking:
            del headers[name]

        # Replace the Server header if it mentions archive infrastructure.
        server_header = headers.get("server", "").lower()
        if "archive" in server_header or "wayback" in server_header:
            headers["server"] = "Apache/2.2.15"

        # Only text-like bodies are scrubbed. Media types are case-insensitive,
        # so compare in lower case.
        content_type = headers.get("content-type", "").lower()
        text_markers = ("text/html", "text/css", "javascript", "application/json")
        if not any(marker in content_type for marker in text_markers):
            return

        # Deliberately best-effort: a body that cannot be decoded or re-encoded
        # is passed through unchanged rather than failing the whole response.
        try:
            body = flow.response.get_text()
            if body:
                for pattern, replacement in SCRUB_PATTERNS:
                    body = pattern.sub(replacement, body)
                flow.response.set_text(body)
        except Exception as e:
            ctx.log.warn(f"[SCRUB] Failed to scrub response: {e}")
# Module-level addon list: mitmproxy picks up the addon instances to run from
# this name when the file is loaded as a script (mitmdump -s).
addons = [TimeTravelRouter()]

15
mitm/entrypoint.sh Normal file
View File

@@ -0,0 +1,15 @@
#!/bin/sh
# Container entrypoint: make sure mitmproxy's config/cert directory exists,
# then replace this shell with mitmdump running the time-travel addon.
set -e

CONFDIR=/home/mitmproxy/.mitmproxy

# Upstream TLS certificate verification is OFF by default to preserve the
# existing behaviour.
# NOTE(review): this setting also covers passthrough traffic to the real
# internet (e.g. api.anthropic.com), so a forged upstream certificate would be
# accepted. Set MITM_SSL_INSECURE=false to verify upstream certificates.
SSL_INSECURE="${MITM_SSL_INSECURE:-true}"

# Create the config/cert directory if it does not exist yet (this does not
# change permissions on an existing directory).
mkdir -p "$CONFDIR"
echo "Cert directory ready: $(ls -la /home/mitmproxy/)"
echo "Starting mitmproxy with time-travel addon..."
exec mitmdump \
    --listen-host 0.0.0.0 \
    --listen-port 8080 \
    --set confdir="$CONFDIR" \
    --set ssl_insecure="$SSL_INSECURE" \
    -s /opt/addon.py \
    --showhost