Initial commit: Anthropic API and MITM proxy to WaybackProxy
This commit is contained in:
116
mitm/addon.py
Normal file
116
mitm/addon.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""
|
||||
mitmproxy addon for the Time Travel Simulation.
|
||||
|
||||
Routing logic:
|
||||
- Requests to api.anthropic.com → pass through to real internet
|
||||
- All other requests → rewrite to HTTP and forward to WaybackProxy
|
||||
|
||||
The key challenge: WaybackProxy is an HTTP-only proxy. When a client
|
||||
requests https://www.bbc.com, mitmproxy terminates TLS, but we need
|
||||
to send WaybackProxy an HTTP request for http://www.bbc.com.
|
||||
We reconstruct the URL as http:// and send it as an explicit proxy
|
||||
request to WaybackProxy on port 8888.
|
||||
"""
|
||||
|
||||
import re
|
||||
from mitmproxy import http, ctx
|
||||
|
||||
# Domains that bypass WaybackProxy entirely and are forwarded to the real
# internet. Checked against flow.request.pretty_host in TimeTravelRouter.
PASSTHROUGH_DOMAINS = {
"api.anthropic.com",  # the Anthropic API must reach the live endpoint
"mitm.it",  # mitmproxy's own cert distribution page
}
|
||||
|
||||
# (compiled regex, replacement) pairs applied IN ORDER to text response
# bodies (see TimeTravelRouter.response) to hide that the content came
# from the Wayback Machine. Order matters: the /web/<timestamp>/ prefix
# strip must run before the generic archive.org catch-alls below it.
SCRUB_PATTERNS = [
# Wayback's injected toolbar, delimited by its BEGIN/END comment markers
(re.compile(r'<!--\s*BEGIN WAYBACK TOOLBAR.*?END WAYBACK TOOLBAR\s*-->', re.DOTALL | re.IGNORECASE), ''),
# Primary: strip archive.org URL prefixes — [^/]+/ matches any timestamp+modifier
(re.compile(r'https?://web\.archive\.org/web/[^/]+/', re.IGNORECASE), ''),
# Fallback: catch any remaining archive.org URLs
(re.compile(r'https?://web\.archive\.org[^\s"\'<>)]*', re.IGNORECASE), ''),
# Wayback-injected <script> tags (only those naming archive.org/wayback)
(re.compile(r'<script[^>]*(?:archive\.org|wayback)[^>]*>.*?</script>', re.DOTALL | re.IGNORECASE), ''),
# Wayback-injected stylesheets
(re.compile(r'<link[^>]*(?:archive\.org|wayback)[^>]*/?>', re.IGNORECASE), ''),
# Wombat URL-rewriting engine bootstrap statements
# NOTE(review): [^;]* is greedy up to the next ';' — could eat unrelated
# code on the same statement if a page legitimately defines "wbhack";
# presumed acceptable for the simulation.
(re.compile(r'(?:var\s+)?_?wbhack[^;]*;', re.IGNORECASE), ''),
(re.compile(r'WB_wombat_Init\([^)]*\);?', re.IGNORECASE), ''),
# Archive meta tags
(re.compile(r'<meta[^>]*archive\.org[^>]*/?>', re.IGNORECASE), ''),
# data-wayback-* / data-archive-* attributes
(re.compile(r'\s*data-(?:wayback|archive)[^=]*="[^"]*"', re.IGNORECASE), ''),
# Last resort: blank any remaining textual mention of archive.org.
# NOTE(review): this also scrubs legitimate page text that merely
# mentions archive.org — assumed intentional for the simulation.
(re.compile(r'archive\.org', re.IGNORECASE), ''),
]
|
||||
|
||||
# Lowercase header-name prefixes removed from responses (matched
# case-insensitively in TimeTravelRouter.response) so archive-specific
# headers never reach the client.
SCRUB_HEADERS = [
    # "x-archive-" also covers "x-archive-orig-*" headers, so a separate
    # "x-archive-orig-" entry would be redundant.
    "x-archive-",
    "x-wayback-",
]
|
||||
|
||||
|
||||
class TimeTravelRouter:
    """mitmproxy addon that routes traffic for the time-travel simulation.

    Requests to PASSTHROUGH_DOMAINS go untouched to the real internet.
    Everything else is downgraded to plain HTTP and redirected to
    WaybackProxy as an explicit-proxy request; responses coming back are
    scrubbed of archive.org fingerprints (headers and body text).
    """

    # WaybackProxy endpoint (an HTTP-only explicit proxy).
    WAYBACK_HOST = "172.30.0.3"
    WAYBACK_PORT = 8888

    def request(self, flow: http.HTTPFlow) -> None:
        """Redirect every non-passthrough request to WaybackProxy."""
        original_host = flow.request.pretty_host

        if original_host in PASSTHROUGH_DOMAINS:
            ctx.log.info(f"[PASSTHROUGH] {flow.request.method} {flow.request.pretty_url}")
            return

        # Build the original HTTP URL (downgrade HTTPS → HTTP); this is
        # the URL WaybackProxy must look up in the archive.
        original_path = flow.request.path  # includes the query string
        http_url = f"http://{original_host}{original_path}"

        ctx.log.info(f"[WAYBACK] {flow.request.pretty_url} → {http_url}")

        # Point the actual connection at WaybackProxy.
        flow.request.scheme = "http"
        flow.request.host = self.WAYBACK_HOST
        flow.request.port = self.WAYBACK_PORT

        # BUG FIX: the previous code assigned `flow.request.url = http_url`
        # here. mitmproxy's url setter re-parses scheme/host/port/path from
        # the assigned URL, which silently undid the host/port redirect
        # above and sent the request back to the origin site on port 80.
        # Assigning the full URL to `path` instead produces the
        # absolute-form request line ("GET http://host/path HTTP/1.1")
        # that an explicit proxy expects, while the TCP connection still
        # goes to WaybackProxy.
        flow.request.path = http_url

        # The Host header must name the original site, not the proxy.
        flow.request.headers["Host"] = original_host

    def response(self, flow: http.HTTPFlow) -> None:
        """Scrub archive.org fingerprints from WaybackProxy responses."""
        # Don't scrub passthrough responses. The Host header was pinned to
        # the original site in request(), so prefer it over pretty_host
        # (which now names the proxy).
        original_host = flow.request.headers.get("Host", flow.request.pretty_host)
        if original_host in PASSTHROUGH_DOMAINS:
            return

        # Remove headers that reveal archive.org. Collect names first:
        # deleting while iterating the header multidict would skip entries.
        scrub_prefixes = tuple(SCRUB_HEADERS)
        headers_to_remove = [
            name for name in flow.response.headers
            if name.lower().startswith(scrub_prefixes)
        ]
        for name in headers_to_remove:
            del flow.response.headers[name]

        # Replace the Server header if it mentions archive infrastructure.
        server_header = flow.response.headers.get("server", "")
        if "archive" in server_header.lower() or "wayback" in server_header.lower():
            flow.response.headers["server"] = "Apache/2.2.15"

        # Scrub text-like bodies only; binary content is left alone.
        content_type = flow.response.headers.get("content-type", "")
        if any(t in content_type for t in ("text/html", "text/css", "javascript", "application/json")):
            try:
                body = flow.response.get_text()
                if body:
                    for pattern, replacement in SCRUB_PATTERNS:
                        body = pattern.sub(replacement, body)
                    flow.response.set_text(body)
            except Exception as e:
                # Best-effort: a scrub failure must not kill the flow.
                ctx.log.warn(f"[SCRUB] Failed to scrub response: {e}")
|
||||
|
||||
|
||||
# mitmproxy discovers addons via this module-level list (loaded with
# `mitmdump -s`; see mitm/entrypoint.sh).
addons = [TimeTravelRouter()]
|
||||
15
mitm/entrypoint.sh
Normal file
15
mitm/entrypoint.sh
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/bin/sh
# Entrypoint: launch mitmproxy with the time-travel routing addon.
# Fail fast on any command error.
set -e

# The confdir must exist before mitmdump starts so it can generate
# and persist its CA certificates there.
mkdir -p /home/mitmproxy/.mitmproxy
echo "Cert directory ready: $(ls -la /home/mitmproxy/)"

echo "Starting mitmproxy with time-travel addon..."
# exec replaces this shell so mitmdump becomes PID 1 and receives signals.
exec mitmdump \
    -s /opt/addon.py \
    --listen-host 0.0.0.0 \
    --listen-port 8080 \
    --showhost \
    --set confdir=/home/mitmproxy/.mitmproxy \
    --set ssl_insecure=true
|
||||
Reference in New Issue
Block a user