"""Download the India pincode directory from api.data.gov.in into a CSV file.

The script pages through resource ``RID`` using ``limit``/``offset``, appends
one CSV row per record to ``OUT`` and stores the next offset to request in
``STATE``.  Starting the script again resumes from that stored offset.

Resume guarantees: the CSV is always flushed *before* the offset is
checkpointed, and the offset is checkpointed on normal completion, on Ctrl-C
and on unexpected errors.  Only a hard kill (SIGKILL, power loss) can leave
rows in the CSV that lie beyond the stored offset; those rows would then be
written a second time by the next run.
"""

import csv
import json
import os
import random
import time

# NOTE(review): this key is committed in source.  It can be overridden with
# the DATA_GOV_IN_API_KEY environment variable; consider rotating the default.
API_KEY = os.environ.get(
    "DATA_GOV_IN_API_KEY",
    "579b464db66ec23bdd000001acb35eb12c5943f068a59a162fc0d8fa",
)
RID = "5c2f62fe-5afa-4119-a499-fec9d604d5bd"
BASE = f"https://api.data.gov.in/resource/{RID}"
OUT = "india_pincode_city_state.csv"
STATE = "state.json"
PAGE = 1000                  # records requested per call
SAVE_EVERY_OFFSETS = 10000   # checkpoint after at least this many new records
FLUSH_EVERY_ROWS = 5000      # flush the CSV after at least this many new rows
PAGE_PAUSE_S = 0.0           # gentle pacing against 429 storms (try 0.2-1.0)


def pad6(x) -> str:
    """Return *x* as a six-character, zero-padded pincode string.

    Falsy input (``None``, ``0``, ``""``) yields ``"000000"``.  Surrounding
    whitespace is removed; input longer than six characters keeps only its
    last six.
    """
    text = str(x or "").strip()
    return text.rjust(6, "0")[-6:]


def load_offset() -> int:
    """Return the offset stored in ``STATE``, or 0 when the file is absent."""
    if os.path.exists(STATE):
        with open(STATE, "r", encoding="utf-8") as f:
            return int(json.load(f).get("offset", 0))
    return 0


def save_offset(offset) -> None:
    """Atomically store *offset* in ``STATE``.

    The value is written to a temporary file first and moved into place with
    ``os.replace`` so a crash never leaves a half-written state file.
    """
    tmp = STATE + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump({"offset": offset}, f)
    os.replace(tmp, STATE)


def record_to_row(rec) -> list:
    """Convert one API record (a dict) into the CSV row written to ``OUT``.

    Columns: pincode, locality, city, state.  Missing or null text fields
    become empty strings.
    """
    return [
        pad6(rec.get("pincode")),
        (rec.get("officename") or "").strip(),
        (rec.get("district") or "").strip(),
        (rec.get("statename") or "").strip(),
    ]


def parse_retry_after(value):
    """Return a ``Retry-After`` header given in whole seconds as an int.

    Returns ``None`` when the header is missing or is not a plain number
    (for example an HTTP date); the caller then uses its own backoff.
    """
    text = (value or "").strip()
    if not text.isdigit():
        return None
    try:
        return int(text)
    except ValueError:
        # str.isdigit() accepts a few characters (e.g. superscripts) that
        # int() rejects.
        return None


def fetch_page(session, offset):
    """Fetch the page that starts at *offset* and return the decoded JSON.

    Retries forever with capped exponential backoff plus jitter on HTTP 429,
    on transport/HTTP/JSON errors, and on ``{"error": ...}`` payloads that the
    API sometimes delivers with status 200.

    *session* is a ``requests.Session`` (or anything with a compatible
    ``get``).
    """
    params = {
        "api-key": API_KEY,
        "format": "json",
        "limit": PAGE,
        "offset": offset,
    }
    backoff = 1.0
    while True:
        try:
            resp = session.get(BASE, params=params, timeout=(5, 45))
            if resp.status_code == 429:
                sleep_s = parse_retry_after(resp.headers.get("Retry-After"))
                if sleep_s is None:
                    sleep_s = max(15, backoff) + random.random() * 2
                print("429 at", offset, "sleep", round(sleep_s, 2), flush=True)
                time.sleep(sleep_s)
                backoff = min(backoff * 1.8, 300)  # cap 5 min
                continue
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            # Deliberately broad: every failure of this request is retried.
            # KeyboardInterrupt is not an Exception and still aborts the run.
            sleep_s = backoff + random.random() * 0.5
            print("err at", offset, repr(e), "sleep", round(sleep_s, 2), flush=True)
            time.sleep(sleep_s)
            backoff = min(backoff * 1.7, 120)
            continue

        # sometimes API returns {"error": "..."} with 200
        if isinstance(data, dict) and data.get("error") and not data.get("records"):
            sleep_s = backoff + random.random() * 0.5
            print("api error at", offset, data["error"], "sleep", round(sleep_s, 2), flush=True)
            time.sleep(sleep_s)
            backoff = min(backoff * 1.7, 120)
            continue

        return data


def main() -> None:
    """Resume from the stored offset and append all remaining records to ``OUT``."""
    # Imported here so the helpers above can be imported (and tested) on
    # machines that do not have the third-party ``requests`` package.
    import requests

    offset = load_offset()
    new_file = not (os.path.exists(OUT) and os.path.getsize(OUT) > 0)

    rows = 0                # rows written during this run
    saved_offset = offset   # offset currently stored in STATE
    unflushed = 0           # rows written since the last explicit flush

    with requests.Session() as session, \
            open(OUT, "a", newline="", encoding="utf-8") as f:
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120 Safari/537.36",
            "Accept": "application/json",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
        })
        writer = csv.writer(f)
        if new_file:
            writer.writerow(["pincode", "locality", "city", "state"])
            f.flush()

        try:
            while True:
                data = fetch_page(session, offset)
                recs = (data.get("records") or []) if isinstance(data, dict) else []
                if not recs:
                    print("done at", offset, flush=True)
                    break

                # Convert the whole page before writing anything so a bad
                # record cannot leave a partially written page behind.
                page_rows = [record_to_row(x) for x in recs]
                writer.writerows(page_rows)

                got = len(page_rows)
                rows += got
                unflushed += got
                # Advance by what was actually received: the API may return
                # fewer records than PAGE.
                offset += got

                # Thresholds instead of "% N == 0": the counters are not
                # guaranteed to ever land on an exact multiple.
                if unflushed >= FLUSH_EVERY_ROWS:
                    f.flush()
                    unflushed = 0
                if offset - saved_offset >= SAVE_EVERY_OFFSETS:
                    # Data first, then state, so the stored offset never
                    # points past rows that are still in the write buffer.
                    f.flush()
                    unflushed = 0
                    save_offset(offset)
                    saved_offset = offset

                print("asked", PAGE, "got", got,
                      "api_limit", data.get("limit"),
                      "api_count", data.get("count"))

                if PAGE_PAUSE_S > 0:
                    time.sleep(PAGE_PAUSE_S)
        finally:
            # Runs on completion, Ctrl-C and errors alike, so the next run
            # resumes exactly after the last row written by this one.
            f.flush()
            if offset != saved_offset:
                save_offset(offset)

    print("final rows written (this run):", rows, flush=True)


if __name__ == "__main__":
    main()