Normalize extracted address fields
All checks were successful
build-and-deploy / build-deploy (push) Successful in 32s
All checks were successful
build-and-deploy / build-deploy (push) Successful in 32s
This commit is contained in:
@@ -244,6 +244,7 @@ class DocumentExtractionService(
|
||||
normalizePinCode(results)
|
||||
normalizeIdNumber(results)
|
||||
markAadhaarIfValid(results)
|
||||
normalizeAddress(results)
|
||||
applyBookingCityUpdates(document, results)
|
||||
results["docType"] = computeDocType(results, handled)
|
||||
applyGuestUpdates(document, propertyId, results)
|
||||
@@ -282,6 +283,13 @@ class DocumentExtractionService(
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Rewrites the address entry of [results] in place with its cleaned form.
 *
 * The entry is looked up under `DocumentPrompts.ADDRESS.first`, passed through
 * `cleanedValue` and then `cleanAddress`. If either step yields `null`, the map
 * is left untouched.
 */
private fun normalizeAddress(results: MutableMap<String, String>) {
    val addressKey = DocumentPrompts.ADDRESS.first
    val cleaned = cleanedValue(results[addressKey])
        ?.let { extracted -> cleanAddress(extracted) }
        ?: return
    results[addressKey] = cleaned
}
|
||||
|
||||
private fun markAadhaarIfValid(results: MutableMap<String, String>) {
|
||||
val idKey = DocumentPrompts.ID_NUMBER.first
|
||||
val digits = normalizeDigits(cleanedValue(results[idKey]))
|
||||
@@ -519,3 +527,54 @@ private fun isValidPin(value: String?): Boolean {
|
||||
if (value.isNullOrBlank()) return false
|
||||
return pinCodeRegex.matches(value)
|
||||
}
|
||||
|
||||
/** Matches a part that starts with a relation marker such as "S/O" or "C/O". */
private val addressRelationRegex =
    Regex("^\\s*(S/O|D/O|W/O|C/O|H/O|F/O)\\b", RegexOption.IGNORE_CASE)

/** Collapses any run of whitespace into a single space. */
private val addressWhitespaceRegex = Regex("\\s+")

/** Parts shorter than this after cleaning are discarded. */
private const val MIN_ADDRESS_PART_LENGTH = 3

/** Leftover phrases (compared lower-cased) that carry no address information. */
private val addressDropPhrases = setOf(
    "address", "addr", "area", "area was", "street/road/lane", "village/town/city", "colony",
)

/**
 * Leading form labels stripped from each address part, compiled once.
 *
 * Every pattern is applied in list order, so a longer label must come before any
 * shorter label it starts with (e.g. "HOUSE NO" before "HOUSE"); otherwise the
 * shorter one strips first and leaves a stray remainder such as "NO".
 * The trailing separator class includes '.', so abbreviation dots ("P.O.", "DIST.")
 * are consumed together with the label.
 */
private val addressPrefixRegexes: List<Regex> = listOf(
    "ADDRESS",
    "HOUSE\\s*/\\s*BLDG\\.?\\s*/\\s*APT",
    "HOUSE-?BLDG",
    "HOUSE\\s*NO",
    "H\\.?\\s*NO",
    "HOUSE",
    "STREET/ROAD/LANE",
    "STREET",
    "ROAD",
    "LANE",
    "AREA",
    "SECTOR",
    "COLONY",
    "VILLAGE/TOWN/CITY",
    "VILLAGE",
    "TOWN",
    "CITY",
    "P\\.?\\s*O",
    "POST\\s*OFFICE",
    "P\\.?\\s*DIST",
    "DISTRICT",
    "DIST",
    "STATE",
    "PIN\\s*CODE", // \s* may be empty, so this also covers "PINCODE"
    "PIN",
).map { label -> Regex("^\\s*(?:$label)\\b[.:\\-\\s]*", RegexOption.IGNORE_CASE) }

/**
 * Cleans a single comma/newline-delimited address part.
 *
 * @return the cleaned part, or `null` when the part should be discarded: it is
 * blank, starts with a relation marker, is shorter than
 * [MIN_ADDRESS_PART_LENGTH] after cleaning, or is one of [addressDropPhrases].
 */
private fun cleanAddressPart(part: String): String? {
    val trimmed = part.trim()
    if (trimmed.isEmpty() || addressRelationRegex.containsMatchIn(trimmed)) return null

    // Apply every label pattern in turn so stacked labels ("Address: House No 12") all go.
    val withoutLabels = addressPrefixRegexes.fold(trimmed) { current, regex ->
        regex.replace(current, "").trim()
    }
    val collapsed = withoutLabels.replace(addressWhitespaceRegex, " ").trim()

    return collapsed.takeUnless {
        it.length < MIN_ADDRESS_PART_LENGTH || it.lowercase() in addressDropPhrases
    }
}

/**
 * Normalizes a raw extracted address string.
 *
 * The input is split on commas and newlines. Each part has its leading form
 * labels removed and its whitespace collapsed; parts that start with a relation
 * marker, end up shorter than three characters, or are a known filler phrase
 * are dropped. The surviving parts are joined with ", ".
 *
 * @param raw the address text as extracted from the document.
 * @return the normalized address, or `null` when no part survives cleaning.
 */
private fun cleanAddress(raw: String): String? =
    raw.replace("\n", ",")
        .split(",")
        .mapNotNull { part -> cleanAddressPart(part) }
        .takeIf { it.isNotEmpty() }
        ?.joinToString(", ")
|
||||
|
||||
Reference in New Issue
Block a user