Build URL inventories for AI training pipelines
Collect comprehensive URL lists across domains for AI training data pipelines. Sitemaps are the most efficient way to discover all content on a domain without crawling.
import requests
import json
# Collect every URL listed in each domain's sitemap via the SitemapKit API,
# then write the combined inventory to training_urls.json.
domains = ["docs.python.org", "react.dev", "developer.mozilla.org"]
all_urls = []
for domain in domains:
    resp = requests.post(
        "https://sitemapkit.com/api/v1/sitemap/full",
        headers={"x-api-key": "YOUR_API_KEY"},  # replace with a real key before running
        json={"url": domain},
        timeout=30,  # never hang the pipeline on a stalled server
    )
    # Fail fast with a clear HTTPError on auth/quota/server problems instead of
    # a confusing KeyError/JSONDecodeError further down.
    resp.raise_for_status()
    urls = resp.json().get("urls", [])
    # Each entry is expected to carry a "loc" key (sitemap <loc> element) —
    # TODO confirm against the API's response schema.
    all_urls.extend(u["loc"] for u in urls)

print(f"Collected {len(all_urls)} URLs for training data")
with open("training_urls.json", "w", encoding="utf-8") as f:
    json.dump(all_urls, f)
# NOTE: Free tier includes 100 API calls/month. No credit card required.