Initial API extractor
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
.idea
|
||||
165628
india_pincode_city_state.csv
Normal file
165628
india_pincode_city_state.csv
Normal file
File diff suppressed because it is too large
Load Diff
115
main.py
Normal file
115
main.py
Normal file
@@ -0,0 +1,115 @@
|
||||
import csv, json, os, time, random, requests
|
||||
|
||||
# --- Configuration ---------------------------------------------------------
# data.gov.in API key. NOTE(review): this key was committed in plain text;
# prefer supplying it via the DATA_GOV_API_KEY environment variable (the
# hard-coded value is kept only as a backward-compatible fallback).
API_KEY = os.environ.get(
    "DATA_GOV_API_KEY",
    "579b464db66ec23bdd000001acb35eb12c5943f068a59a162fc0d8fa",
)
# Resource id of the pincode directory dataset on data.gov.in.
RID = "5c2f62fe-5afa-4119-a499-fec9d604d5bd"
BASE = f"https://api.data.gov.in/resource/{RID}"

OUT = "india_pincode_city_state.csv"  # output CSV, appended to across runs
STATE = "state.json"                  # resume checkpoint: {"offset": N}
PAGE = 1000                           # records requested per API call
SAVE_EVERY_OFFSETS = 10000            # checkpoint the offset roughly this often
FLUSH_EVERY_ROWS = 5000               # flush the CSV buffer roughly this often
|
||||
|
||||
def pad6(x):
    """Normalize *x* to a 6-character pincode string.

    Falsy values become "000000"; shorter values are left-padded with
    zeros; longer values keep only their last six characters.
    """
    digits = str(x or "")
    return digits.rjust(6, "0")[-6:]
|
||||
|
||||
def load_offset():
    """Return the resume offset stored in STATE, or 0 when no checkpoint exists."""
    try:
        with open(STATE, "r", encoding="utf-8") as fh:
            checkpoint = json.load(fh)
    except FileNotFoundError:
        return 0
    return int(checkpoint.get("offset", 0))
|
||||
|
||||
def save_offset(offset):
    """Atomically persist *offset* to the STATE checkpoint file.

    Writes a temporary sibling first and then renames it over STATE, so a
    crash mid-write can never leave a truncated checkpoint behind.
    """
    scratch = STATE + ".tmp"
    with open(scratch, "w", encoding="utf-8") as fh:
        fh.write(json.dumps({"offset": offset}))
    os.replace(scratch, STATE)
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main download loop: page through the dataset with limit/offset, append rows
# to OUT, and checkpoint the offset to STATE so an interrupted run resumes.
# ---------------------------------------------------------------------------

offset = load_offset()

# Write the CSV header only when starting a brand-new (or empty) output file.
new_file = not (os.path.exists(OUT) and os.path.getsize(OUT) > 0)
f = open(OUT, "a", newline="", encoding="utf-8")
w = csv.writer(f)
if new_file:
    w.writerow(["pincode", "locality", "city", "state"])
    f.flush()

s = requests.Session()
s.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120 Safari/537.36",
    "Accept": "application/json",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
})

backoff = 1.0
rows = 0
# Moving thresholds for flush/checkpoint/log.  The previous
# `rows % N == 0` / `offset % N == 0` tests could stop firing forever once a
# partial page (offset advances by `got`, which may be < PAGE) pushed the
# counters off exact multiples of N.
next_flush = FLUSH_EVERY_ROWS
next_save = offset + SAVE_EVERY_OFFSETS
next_log = 1000

try:
    while True:
        url = f"{BASE}?api-key={API_KEY}&format=json&limit={PAGE}&offset={offset}"

        try:
            r = s.get(url, timeout=(5, 45))

            if r.status_code == 429:
                # Rate-limited: honour Retry-After when present, otherwise
                # back off exponentially (capped at 5 min) with jitter.
                ra = r.headers.get("Retry-After")
                if ra and ra.isdigit():
                    sleep_s = int(ra)
                else:
                    sleep_s = max(15, backoff) + random.random() * 2

                print("429 at", offset, "sleep", round(sleep_s, 2), flush=True)
                time.sleep(sleep_s)
                backoff = min(backoff * 1.8, 300)  # cap 5 min
                continue

            r.raise_for_status()
            data = r.json()

        except Exception as e:
            # Best-effort scraper: on any network/HTTP/JSON failure, sleep
            # and retry the SAME offset rather than crash.
            sleep_s = backoff + random.random() * 0.5
            print("err at", offset, repr(e), "sleep", round(sleep_s, 2), flush=True)
            time.sleep(sleep_s)
            backoff = min(backoff * 1.7, 120)
            continue

        # sometimes API returns {"error": "..."} with 200
        if isinstance(data, dict) and data.get("error") and not data.get("records"):
            sleep_s = backoff + random.random() * 0.5
            print("api error at", offset, data["error"], "sleep", round(sleep_s, 2), flush=True)
            time.sleep(sleep_s)
            backoff = min(backoff * 1.7, 120)
            continue

        recs = (data.get("records") or []) if isinstance(data, dict) else []
        got = len(recs)
        if not recs:
            # Empty page means we've paged past the end of the dataset.
            print("done at", offset, flush=True)
            break

        backoff = 1.0  # reset on success

        for x in recs:
            w.writerow([
                pad6(x.get("pincode")),
                (x.get("officename") or "").strip(),
                (x.get("district") or "").strip(),
                (x.get("statename") or "").strip(),
            ])

        rows += got
        offset += got  # advance by what we actually received, not PAGE

        if rows >= next_flush:
            f.flush()
            next_flush = rows + FLUSH_EVERY_ROWS
        if offset >= next_save:
            save_offset(offset)
            next_save = offset + SAVE_EVERY_OFFSETS

        if rows >= next_log:
            print("asked", PAGE, "got", got, "api_limit", data.get("limit"), "api_count", data.get("count"))
            next_log = rows + 1000

        # gentle pacing to avoid 429 storms (tune 0.2-1.0)
        # time.sleep(0.25)
finally:
    # Always persist progress and close the CSV — previously a completed or
    # interrupted run never saved its final offset, so a restart re-fetched
    # everything since the last % checkpoint.
    save_offset(offset)
    f.close()

print("final rows written (this run):", rows, flush=True)
|
||||
Reference in New Issue
Block a user