Use SitemapKit to discover all URLs, then crawl them with Crawl4AI for LLM-friendly content extraction. The perfect combination for building AI training datasets.
"""Discover URLs with SitemapKit, then crawl them with Crawl4AI.

Step 1 fetches every URL on a domain via SitemapKit's /api/v1/sitemap/full
endpoint; step 2 crawls each URL with Crawl4AI and saves the LLM-ready
markdown into a local dataset directory.
"""
import asyncio
import hashlib
from pathlib import Path

import requests
from crawl4ai import AsyncWebCrawler


def save_to_dataset(url: str, markdown: str, out_dir: str = "dataset") -> Path:
    """Write one page's markdown to *out_dir* and return the path written.

    The filename is a SHA-256 hash of the URL: stable and filesystem-safe
    regardless of the URL's length or characters. The source URL is kept
    in an HTML comment on the first line so the record stays traceable.
    """
    directory = Path(out_dir)
    directory.mkdir(parents=True, exist_ok=True)
    path = directory / f"{hashlib.sha256(url.encode('utf-8')).hexdigest()}.md"
    path.write_text(f"<!-- source: {url} -->\n{markdown}", encoding="utf-8")
    return path


def fetch_urls(domain: str, api_key: str) -> list[str]:
    """Step 1: get all URLs for *domain* via SitemapKit's full endpoint."""
    resp = requests.post(
        "https://sitemapkit.com/api/v1/sitemap/full",
        headers={"x-api-key": api_key, "Content-Type": "application/json"},
        json={"url": domain},
        timeout=60,  # never hang indefinitely on a network call
    )
    resp.raise_for_status()  # fail loudly on auth/quota/server errors
    return [u["loc"] for u in resp.json()["urls"]]


async def main() -> None:
    urls = fetch_urls("docs.example.com", "YOUR_API_KEY")
    print(f"Found {len(urls)} URLs to crawl")
    # Step 2: Crawl with Crawl4AI for LLM-ready content
    async with AsyncWebCrawler() as crawler:
        for url in urls[:50]:  # cap the demo run at 50 pages
            result = await crawler.arun(url=url)
            if result.success:
                # result.markdown contains clean, LLM-ready text
                save_to_dataset(url, result.markdown)


if __name__ == "__main__":
    # `await` / `async with` are only legal inside a coroutine, so the
    # original top-level form could not run; asyncio.run drives it here.
    asyncio.run(main())

# SitemapKit API notes (from the surrounding docs page):
# - Authenticate with your sk_live_* API key via the x-api-key header.
# - POST /api/v1/sitemap/discover — find all sitemaps on a domain.
# - POST /api/v1/sitemap/extract — parse a sitemap URL and extract all URLs.
# - POST /api/v1/sitemap/full — discover + extract in one call (recommended).
# - Free tier: 100 API calls/month; no credit card required.