Initial API extractor

This commit is contained in:
androidlover5842
2026-02-05 19:33:37 +05:30
commit 17d52ee41d
3 changed files with 165744 additions and 0 deletions

115
main.py Normal file
View File

@@ -0,0 +1,115 @@
import csv, json, os, time, random, requests
# NOTE(review): API key is hard-coded in source — consider moving it to an
# environment variable and rotating this one before publishing the repo.
API_KEY="579b464db66ec23bdd000001acb35eb12c5943f068a59a162fc0d8fa"
RID="5c2f62fe-5afa-4119-a499-fec9d604d5bd"  # data.gov.in resource id for the pincode dataset
BASE=f"https://api.data.gov.in/resource/{RID}"
OUT="india_pincode_city_state.csv"  # output CSV (appended to across runs)
STATE="state.json"                  # resume file holding the last saved offset
PAGE=1000                           # records requested per API call
SAVE_EVERY_OFFSETS = 10000          # persist offset roughly every this many records
FLUSH_EVERY_ROWS = 5000             # flush the CSV buffer every this many rows
def pad6(x):
    """Render *x* as a zero-padded string, keeping only the last 6 characters."""
    text = str(x or "")
    return text.rjust(6, "0")[-6:]
def load_offset():
    """Return the persisted pagination offset from STATE, or 0 if the file is absent."""
    if not os.path.exists(STATE):
        return 0
    with open(STATE, "r", encoding="utf-8") as fh:
        saved = json.load(fh)
    return int(saved.get("offset", 0))
def save_offset(offset):
    """Persist *offset* to STATE atomically (write a temp file, then rename over)."""
    scratch = STATE + ".tmp"
    with open(scratch, "w", encoding="utf-8") as fh:
        fh.write(json.dumps({"offset": offset}))
    os.replace(scratch, STATE)
# --- Resume state & output setup --------------------------------------------
offset = load_offset()
new_file = not (os.path.exists(OUT) and os.path.getsize(OUT) > 0)
f = open(OUT, "a", newline="", encoding="utf-8")
w = csv.writer(f)
if new_file:
    w.writerow(["pincode", "locality", "city", "state"])
    f.flush()

# One session reuses the TCP connection across pages; browser-like headers.
s = requests.Session()
s.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120 Safari/537.36",
    "Accept": "application/json",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
})

backoff = 1.0        # current retry delay (seconds); grows on failure, resets on success
rows = 0             # rows written this run
last_saved = offset  # offset most recently persisted to STATE
try:
    while True:
        url = f"{BASE}?api-key={API_KEY}&format=json&limit={PAGE}&offset={offset}"
        try:
            r = s.get(url, timeout=(5, 45))  # (connect, read) timeouts
            if r.status_code == 429:
                # Rate limited: honor Retry-After when present, else back off with jitter.
                ra = r.headers.get("Retry-After")
                if ra and ra.isdigit():
                    sleep_s = int(ra)
                else:
                    sleep_s = max(15, backoff) + random.random() * 2
                print("429 at", offset, "sleep", round(sleep_s, 2), flush=True)
                time.sleep(sleep_s)
                backoff = min(backoff * 1.8, 300)  # cap 5 min
                continue
            r.raise_for_status()
            data = r.json()
        except Exception as e:
            # Network / HTTP / JSON-decode failure: sleep with jitter, retry same offset.
            sleep_s = backoff + random.random() * 0.5
            print("err at", offset, repr(e), "sleep", round(sleep_s, 2), flush=True)
            time.sleep(sleep_s)
            backoff = min(backoff * 1.7, 120)
            continue
        # sometimes API returns {"error": "..."} with 200
        if isinstance(data, dict) and data.get("error") and not data.get("records"):
            sleep_s = backoff + random.random() * 0.5
            print("api error at", offset, data["error"], "sleep", round(sleep_s, 2), flush=True)
            time.sleep(sleep_s)
            backoff = min(backoff * 1.7, 120)
            continue
        recs = (data.get("records") or []) if isinstance(data, dict) else []
        got = len(recs)
        if not recs:
            # Empty page means the dataset is exhausted.
            print("done at", offset, flush=True)
            break
        backoff = 1.0  # reset on success
        for x in recs:
            w.writerow([
                pad6(x.get("pincode")),
                (x.get("officename") or "").strip(),
                (x.get("district") or "").strip(),
                (x.get("statename") or "").strip(),
            ])
        rows += got
        offset += got  # advance by rows actually received, not by PAGE
        if rows % FLUSH_EVERY_ROWS == 0:
            f.flush()
        # BUGFIX: the old "offset % SAVE_EVERY_OFFSETS == 0" trigger skipped
        # every subsequent save once a short page left offset misaligned;
        # track distance since the last save instead.
        if offset - last_saved >= SAVE_EVERY_OFFSETS:
            save_offset(offset)
            last_saved = offset
        if rows % 1000 == 0:
            print("asked", PAGE, "got", got, "api_limit", data.get("limit"), "api_count", data.get("count"))
        # gentle pacing to avoid 429 storms (tune 0.2-1.0)
        # time.sleep(0.25)
finally:
    # BUGFIX: always flush/close the CSV (even on Ctrl-C), then persist the
    # final offset — the original never saved it on completion, so a rerun
    # re-downloaded and re-appended pages already written to the CSV.
    f.close()
    if offset != last_saved:
        save_offset(offset)
print("final rows written (this run):", rows, flush=True)