privaterelay/country_utils.py (210 lines of code) (raw):
import logging
from string import ascii_uppercase
from django.http import HttpRequest
from django.utils.translation.trans_real import parse_accept_lang_header
# Logger named "eventsinfo"; used in this module to record how a request's
# region was selected (see the "region_details" log call).
info_logger = logging.getLogger("eventsinfo")
# Map a primary language subtag to the most probable country.
# The top country is derived from CLDR42 Supplemental Data, Language-Territory
# Information, with the exception of Spanish (es), which is mapped to Spain (ES)
# instead of Mexico (MX). Mexico has the most speakers, but those users usually
# specify es-MX.
_PRIMARY_LANGUAGE_TO_COUNTRY = {
    "ace": "ID",  # Acehnese -> Indonesia
    "ach": "UG",  # Acholi -> Uganda
    "af": "ZA",  # Afrikaans -> South Africa
    "an": "ES",  # Aragonese -> Spain
    "ar": "EG",  # Arabic -> Egypt
    "arn": "CL",  # Mapudungun -> Chile
    "as": "IN",  # Assamese -> India
    "ast": "ES",  # Asturian -> Spain
    "az": "AZ",  # Azerbaijani -> Azerbaijan
    "be": "BY",  # Belarusian -> Belarus
    "bg": "BG",  # Bulgarian -> Bulgaria
    "bn": "BD",  # Bengali -> Bangladesh
    "bo": "CN",  # Tibetan -> China
    "br": "FR",  # Breton -> France
    "brx": "IN",  # Bodo -> India
    "bs": "BA",  # Bosnian -> Bosnia and Herzegovina
    "ca": "FR",  # Catalan -> France
    "cak": "MX",  # Kaqchikel -> Mexico
    "ckb": "IQ",  # Central Kurdish -> Iraq
    "cs": "CZ",  # Czech -> Czech Republic
    "cv": "RU",  # Chuvash -> Russia
    "cy": "GB",  # Welsh -> United Kingdom
    "da": "DK",  # Danish -> Denmark
    "de": "DE",  # German -> Germany
    "dsb": "DE",  # Lower Sorbian -> Germany
    "el": "GR",  # Greek -> Greece
    "en": "US",  # English -> United States
    "eo": "SM",  # Esperanto -> San Marino
    "es": "ES",  # Spanish -> Spain (instead of Mexico, top by population)
    "et": "EE",  # Estonian -> Estonia
    "eu": "ES",  # Basque -> Spain
    "fa": "IR",  # Persian -> Iran
    "ff": "SN",  # Fulah -> Senegal
    "fi": "FI",  # Finnish -> Finland
    "fr": "FR",  # French -> France
    "frp": "FR",  # Arpitan -> France
    "fur": "IT",  # Friulian -> Italy
    "fy": "NL",  # Frisian -> Netherlands
    "ga": "IE",  # Irish -> Ireland
    "gd": "GB",  # Scottish Gaelic -> United Kingdom
    "gl": "ES",  # Galician -> Spain
    "gn": "PY",  # Guarani -> Paraguay
    "gu": "IN",  # Gujarati -> India
    "gv": "IM",  # Manx -> Isle of Man
    "he": "IL",  # Hebrew -> Israel
    "hi": "IN",  # Hindi -> India
    "hr": "HR",  # Croatian -> Croatia
    "hsb": "DE",  # Upper Sorbian -> Germany
    "hu": "HU",  # Hungarian -> Hungary
    "hy": "AM",  # Armenian -> Armenia
    "hye": "AM",  # Armenian Eastern Classic Orthography -> Armenia
    "ia": "FR",  # Interlingua -> France
    "id": "ID",  # Indonesian -> Indonesia
    "ilo": "PH",  # Iloko -> Philippines
    "is": "IS",  # Icelandic -> Iceland
    "it": "IT",  # Italian -> Italy
    "ixl": "MX",  # Ixil -> Mexico
    "ja": "JP",  # Japanese -> Japan
    "jiv": "MX",  # Shuar -> Mexico
    "ka": "GE",  # Georgian -> Georgia
    "kab": "DZ",  # Kabyle -> Algeria
    "kk": "KZ",  # Kazakh -> Kazakhstan
    "km": "KH",  # Khmer -> Cambodia
    "kn": "IN",  # Kannada -> India
    "ko": "KR",  # Korean -> South Korea
    "ks": "IN",  # Kashmiri -> India
    "lb": "LU",  # Luxembourgish -> Luxembourg
    "lg": "UG",  # Luganda -> Uganda
    "lij": "IT",  # Ligurian -> Italy
    "lo": "LA",  # Lao -> Laos
    "lt": "LT",  # Lithuanian -> Lithuania
    "ltg": "LV",  # Latgalian -> Latvia
    "lus": "US",  # Mizo -> United States
    "lv": "LV",  # Latvian -> Latvia
    "mai": "IN",  # Maithili -> India
    "meh": "MX",  # Mixteco Yucuhiti -> Mexico
    "mix": "MX",  # Mixtepec Mixtec -> Mexico
    "mk": "MK",  # Macedonian -> North Macedonia
    "ml": "IN",  # Malayalam -> India
    "mr": "IN",  # Marathi -> India
    "ms": "MY",  # Malay -> Malaysia
    "my": "MM",  # Burmese -> Myanmar
    "nb": "NO",  # Norwegian Bokmål -> Norway
    "ne": "NP",  # Nepali -> Nepal
    "nl": "NL",  # Dutch -> Netherlands
    "nn": "NO",  # Norwegian Nynorsk -> Norway
    "oc": "FR",  # Occitan -> France
    "or": "IN",  # Odia -> India
    "pa": "IN",  # Punjabi -> India
    "pl": "PL",  # Polish -> Poland
    "ppl": "MX",  # Náhuat Pipil -> Mexico
    "pt": "BR",  # Portuguese -> Brazil
    "quc": "GT",  # K'iche' -> Guatemala
    "rm": "CH",  # Romansh -> Switzerland
    "ro": "RO",  # Romanian -> Romania
    "ru": "RU",  # Russian -> Russia
    "sat": "IN",  # Santali (Ol Chiki) -> India
    "sc": "IT",  # Sardinian -> Italy
    "scn": "IT",  # Sicilian -> Italy
    "sco": "GB",  # Scots -> United Kingdom
    "si": "LK",  # Sinhala -> Sri Lanka
    "sk": "SK",  # Slovak -> Slovakia
    "skr": "PK",  # Saraiki -> Pakistan
    "sl": "SI",  # Slovenian -> Slovenia
    "son": "ML",  # Songhay -> Mali
    "sq": "AL",  # Albanian -> Albania
    "sr": "RS",  # Serbian -> Serbia
    "sv": "SE",  # Swedish -> Sweden
    "sw": "TZ",  # Swahili -> Tanzania
    "szl": "PL",  # Silesian -> Poland
    "ta": "IN",  # Tamil -> India
    "te": "IN",  # Telugu -> India
    "tg": "TJ",  # Tajik -> Tajikistan
    "th": "TH",  # Thai -> Thailand
    "tl": "PH",  # Tagalog -> Philippines
    "tr": "TR",  # Turkish or Crimean Tatar -> Turkey
    "trs": "MX",  # Triqui -> Mexico
    "uk": "UA",  # Ukrainian -> Ukraine
    "ur": "PK",  # Urdu -> Pakistan
    "uz": "UZ",  # Uzbek -> Uzbekistan
    "vi": "VN",  # Vietnamese -> Vietnam
    "wo": "SN",  # Wolof -> Senegal
    "xcl": "AM",  # Armenian Classic -> Armenia
    "xh": "ZA",  # Xhosa -> South Africa
    "zam": "MX",  # Miahuatlán Zapotec -> Mexico
    "zh": "CN",  # Chinese -> China
}
# Special cases for language tags
# Keys are (lower-case primary language, upper-case subtag) pairs, matching the
# lookup done in guess_country_from_accept_lang; values are country codes.
_LANGUAGE_TAG_TO_COUNTRY_OVERRIDE = {
    # Would be Catalan with the Valencian variant -> France
    # Change to Valencian -> Spain
    ("ca", "VALENCIA"): "ES",
    # Spanish in UN region 419 (Latin America and Caribbean)
    # Pick Mexico, which has the highest number of Spanish speakers
    ("es", "419"): "MX",
    # Would be Galician (Greenland) -> Greenland
    # Change to Galician (Galicia region of Spain) -> Spain
    ("gl", "GL"): "ES",
}
class AcceptLanguageError(ValueError):
"""There was an issue processing the Accept-Language header."""
def __init__(self, message: str, accept_lang: str | None = None):
super().__init__(message)
self.accept_lang = accept_lang
def guess_country_from_accept_lang(accept_lang: str) -> str:
    """
    Guess the user's country from the Accept-Language header

    Return is a 2-letter ISO 3166 country code

    If an issue is detected, an AcceptLanguageError is raised.

    The header may come directly from a web request, or may be the header
    captured by Mozilla Accounts (FxA) at signup.

    Even with all this logic and special casing, it is still more accurate to
    use a GeoIP lookup or a country code provided by the infrastructure.

    See RFC 9110, "HTTP Semantics", section 12.5.4, "Accept-Language"
    See RFC 5646, "Tags for Identifying Languages", and examples in Appendix A

    Only the first language tag returned by the parser is examined. Its
    subtags after the primary language are scanned left to right, and the
    first one that is either a special case in
    _LANGUAGE_TAG_TO_COUNTRY_OVERRIDE or a two-letter, non-reserved region
    code decides the result. If no subtag decides, the primary language is
    looked up in _PRIMARY_LANGUAGE_TO_COUNTRY.

    Raises:
        AcceptLanguageError: the header yields no language tags, the primary
            language is irregular ("i"), private-use ("x", "qaa" to "qtz"),
            a wildcard ("*"), a single character, or has no known country.
            The original header is available as the accept_lang attribute.
    """
    lang_q_pairs = parse_accept_lang_header(accept_lang.strip())
    if not lang_q_pairs:
        raise AcceptLanguageError("Invalid Accept-Language string", accept_lang)

    # Only the first (language tag, q) pair from the parser is used
    top_lang_tag = lang_q_pairs[0][0]
    subtags = top_lang_tag.split("-")
    lang = subtags[0].lower()

    # Reject primary subtags that do not identify a real language
    if lang == "i":
        raise AcceptLanguageError("Irregular language tag", accept_lang)
    if lang == "x":
        raise AcceptLanguageError("Private-use language tag", accept_lang)
    if lang == "*":
        raise AcceptLanguageError("Wildcard language tag", accept_lang)
    if len(lang) < 2:
        raise AcceptLanguageError("Invalid one-character primary language", accept_lang)
    # Three-letter codes "qaa" through "qtz" are reserved for private use
    if len(lang) == 3 and lang[0] == "q" and lang[1] <= "t":
        raise AcceptLanguageError(
            "Private-use language tag (RFC 5646 2.2.1)", accept_lang
        )

    for maybe_region_raw in subtags[1:]:
        maybe_region = maybe_region_raw.upper()

        # Look for a special case
        if override := _LANGUAGE_TAG_TO_COUNTRY_OVERRIDE.get((lang, maybe_region)):
            return override

        if len(maybe_region) <= 1:
            # One-character extension or empty, stop processing
            break

        if (
            len(maybe_region) == 2
            and all(c in ascii_uppercase for c in maybe_region)
            and
            # RFC 5646 2.2.4 "Region Subtag" point 6, reserved subtags
            maybe_region != "AA"
            and maybe_region != "ZZ"
            and maybe_region[0] != "X"
            and (maybe_region[0] != "Q" or maybe_region[1] < "M")
        ):
            # Subtag is a non-private ISO 3166 country code
            return maybe_region

        # Subtag is probably a script, like "Hans" in "zh-Hans-CN"
        # Loop to the next subtag, which might be an ISO 3166 country code

    # Guess the country from a simple language tag
    try:
        return _PRIMARY_LANGUAGE_TO_COUNTRY[lang]
    except KeyError:
        # The KeyError carries no extra information, so suppress it instead of
        # showing a misleading "during handling of the above exception" chain.
        # The message text (including its spelling) is kept unchanged in case
        # callers or tests match on it.
        raise AcceptLanguageError("Unknown langauge", accept_lang) from None
def _get_cc_from_lang(accept_lang: str) -> str:
    """
    Return the country code guessed from an Accept-Language value.

    An empty string is returned when guess_country_from_accept_lang raises
    AcceptLanguageError, so this helper never raises that error.
    """
    try:
        country_code = guess_country_from_accept_lang(accept_lang)
    except AcceptLanguageError:
        # No usable guess; signal that with an empty country code
        country_code = ""
    return country_code
def _get_cc_from_request(request: HttpRequest) -> str:
    """
    Determine the user's region / country code for a request.

    Sources, in priority order:

    1. The X-Client-Region header, upper-cased (logged as method "cdn").
    2. A guess from the Accept-Language header (method "accept_lang"). This
       is an empty string when the header is present but no country can be
       guessed from it.
    3. "US" when neither header is present (method "fallback").

    The details of the selection are logged as "region_details" at most once
    per request object.
    """
    headers = request.headers
    details: dict[str, str] = {}
    region: str | None = None

    # The region header, when present, always wins
    client_region = headers.get("X-Client-Region")
    if client_region is not None:
        region = client_region.upper()
        details["cdn_region"] = region
        details["region_method"] = "cdn"

    # The Accept-Language guess is always logged, but only used as the region
    # when the region header did not provide one
    accept_lang = headers.get("Accept-Language")
    if accept_lang is not None:
        details["accept_lang"] = accept_lang
        lang_region = _get_cc_from_lang(accept_lang)
        details["accept_lang_region"] = lang_region
        if region is None:
            region = lang_region
            details["region_method"] = "accept_lang"

    # Neither header was sent
    if region is None:
        region = "US"
        details["region_method"] = "fallback"
    details["region"] = region

    # MPP-3284: Log details of region selection. Only log once per request, since
    # some endpoints, like /api/v1/runtime_data, call this multiple times.
    already_logged = getattr(request, "_logged_region_details", False)
    if not already_logged:
        setattr(request, "_logged_region_details", True)
        info_logger.info("region_details", extra=details)

    return region