115 lines
3.3 KiB
Python
115 lines
3.3 KiB
Python
import csv, json, os, time, random, requests
|
|
|
|
# --- Configuration ---------------------------------------------------------

# API key sent as the `api-key` query parameter. It can be supplied through
# the DATA_GOV_IN_API_KEY environment variable so the credential does not have
# to live in the source; the literal below is the fallback used when the
# variable is unset or empty.
API_KEY = (
    os.environ.get("DATA_GOV_IN_API_KEY")
    or "579b464db66ec23bdd000001acb35eb12c5943f068a59a162fc0d8fa"
)

# Identifier of the resource to download, and the endpoint URL built from it.
RID = "5c2f62fe-5afa-4119-a499-fec9d604d5bd"
BASE = f"https://api.data.gov.in/resource/{RID}"

# Output CSV that rows are appended to.
OUT = "india_pincode_city_state.csv"

# JSON file holding the resume offset (see load_offset / save_offset).
STATE = "state.json"

# Number of records requested per API call (sent as `limit`).
PAGE = 1000

# Interval, in records, between checkpoints of the offset to STATE.
SAVE_EVERY_OFFSETS = 10000

# Interval, in rows, between flushes of the CSV file.
FLUSH_EVERY_ROWS = 5000
|
|
|
|
def pad6(x):
    """Return x as a six-character string, left-padded with zeros.

    Falsy values (None, 0, "") yield "000000". Values whose text is longer
    than six characters keep only their last six characters.
    """
    text = str(x) if x else ""
    return text.rjust(6, "0")[-6:]
|
|
|
|
def load_offset():
    """Return the resume offset recorded in the STATE file.

    Returns 0 when the STATE file does not exist, or when the JSON object it
    holds has no "offset" entry; otherwise the stored value converted with
    int().
    """
    if not os.path.exists(STATE):
        return 0

    with open(STATE, "r", encoding="utf-8") as state_file:
        saved = json.load(state_file)
    return int(saved.get("offset", 0))
|
|
|
|
def save_offset(offset):
    """Record offset in the STATE file as the JSON object {"offset": offset}.

    The JSON is written to a sibling ".tmp" file first and then moved over
    STATE with os.replace, so STATE is swapped in a single step rather than
    rewritten in place.
    """
    tmp_path = f"{STATE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as tmp_file:
        tmp_file.write(json.dumps({"offset": offset}))
    os.replace(tmp_path, STATE)
|
|
|
|
# --- Resumable download ----------------------------------------------------
#
# Pages through the API starting at the checkpointed offset and appends one
# CSV row per record to OUT. Every failure (HTTP 429, any exception raised by
# the request or JSON decoding, or an {"error": ...} payload) is retried
# indefinitely with a growing, jittered delay. The loop ends when a page comes
# back with no records.
#
# Checkpointing rules:
#   * Flush/save intervals are counted in rows since the last flush/save, so
#     they keep firing even after a page returns fewer than PAGE records.
#   * The CSV is always flushed before the offset is saved, so the saved
#     offset never claims rows that are not in the file yet.
#   * The offset is also saved when the loop ends for any reason (finished,
#     Ctrl-C, unexpected error), so re-running does not append duplicates of
#     rows written since the last periodic checkpoint.


def _redact_key(text):
    """Return text with the API key masked so it is not written to logs."""
    return text.replace(API_KEY, "***") if API_KEY else text


offset = load_offset()

# Write the header only when the output file is missing or empty; a resumed
# run appends below the existing rows.
new_file = not (os.path.exists(OUT) and os.path.getsize(OUT) > 0)

rows = 0  # rows written during this run

with open(OUT, "a", newline="", encoding="utf-8") as f, requests.Session() as s:
    w = csv.writer(f)
    if new_file:
        w.writerow(["pincode", "locality", "city", "state"])
        f.flush()

    s.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120 Safari/537.36",
        "Accept": "application/json",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    })

    backoff = 1.0  # current retry delay in seconds; reset after each good page
    rows_since_flush = 0
    rows_since_save = 0

    try:
        while True:
            params = {
                "api-key": API_KEY,
                "format": "json",
                "limit": PAGE,
                "offset": offset,
            }

            try:
                r = s.get(BASE, params=params, timeout=(5, 45))

                if r.status_code == 429:
                    # Honour a numeric Retry-After; otherwise wait at least
                    # 15 s (plus jitter) before asking again.
                    ra = r.headers.get("Retry-After")
                    if ra and ra.isdigit():
                        sleep_s = int(ra)
                    else:
                        sleep_s = max(15, backoff) + random.random() * 2

                    print("429 at", offset, "sleep", round(sleep_s, 2), flush=True)
                    time.sleep(sleep_s)
                    backoff = min(backoff * 1.8, 300)  # cap 5 min
                    continue

                r.raise_for_status()
                data = r.json()

            except Exception as e:
                # requests embeds the request URL (including the api-key
                # query parameter) in its exception messages, so mask the
                # key before printing.
                sleep_s = backoff + random.random() * 0.5
                print("err at", offset, _redact_key(repr(e)), "sleep", round(sleep_s, 2), flush=True)
                time.sleep(sleep_s)
                backoff = min(backoff * 1.7, 120)
                continue

            # sometimes API returns {"error": "..."} with 200
            if isinstance(data, dict) and data.get("error") and not data.get("records"):
                sleep_s = backoff + random.random() * 0.5
                print("api error at", offset, data["error"], "sleep", round(sleep_s, 2), flush=True)
                time.sleep(sleep_s)
                backoff = min(backoff * 1.7, 120)
                continue

            recs = (data.get("records") or []) if isinstance(data, dict) else []
            got = len(recs)
            if not recs:
                print("done at", offset, flush=True)
                break

            backoff = 1.0  # reset on success

            # Build the whole page first and write it in one call, so the
            # window in which rows are written but the offset has not yet
            # advanced is as small as possible.
            page_rows = [
                [
                    pad6(x.get("pincode")),
                    (x.get("officename") or "").strip(),
                    (x.get("district") or "").strip(),
                    (x.get("statename") or "").strip(),
                ]
                for x in recs
            ]
            w.writerows(page_rows)

            # Advance by the number of records actually received, which may
            # be fewer than PAGE.
            rows += got
            offset += got
            rows_since_flush += got
            rows_since_save += got

            if rows_since_flush >= FLUSH_EVERY_ROWS:
                f.flush()
                rows_since_flush = 0

            if rows_since_save >= SAVE_EVERY_OFFSETS:
                f.flush()  # rows must reach the file before the offset does
                save_offset(offset)
                rows_since_flush = 0
                rows_since_save = 0

            if rows % 1000 == 0:
                print("asked", PAGE, "got", len(recs), "api_limit", data.get("limit"), "api_count", data.get("count"))

            # To pace requests and avoid 429 storms, a short sleep
            # (roughly 0.2-1.0 s) can be added here.
    finally:
        # Runs on normal completion, Ctrl-C and unexpected errors alike. If
        # the flush itself fails, the offset is deliberately left at its
        # previous checkpoint.
        f.flush()
        save_offset(offset)

print("final rows written (this run):", rows, flush=True)