Source code for catfish_sim.utils

import random
import pandas as pd
import scipy.stats
import math
import os

# [min, max] bounds for the coded values used throughout this module.
LLCP2022_AGE_GROUP_RANGE = [1, 13]
LLCP2022_HEIGHT_RANGE = [91, 241]
LLCP2022_BMI_GROUP_RANGE = [1, 4]

# The survey extract ships next to this module and is read once, at import time.
_LLCP2022_CSV_PATH = os.path.join(
    os.path.dirname(__file__), "data/LLCP2022_sex_age_height_bmi.csv"
)
LLCP2022_DATA = pd.read_csv(_LLCP2022_CSV_PATH)


def get_random_gender(male_weight=0.72):
    """Draws a gender label at random.

    Args:
        male_weight (float, optional): Sampling weight of "Male"; "Female"
            receives the complementary weight ``1 - male_weight``. Defaults
            to 0.72 (based on online dating populations).

    Returns:
        str: Either "Male" or "Female".
    """
    genders = ["Male", "Female"]
    gender_weights = [male_weight, (1 - male_weight)]
    (picked,) = random.choices(genders, weights=gender_weights, k=1)
    return picked
def get_random_strategy(strategies, strategy_weights=None):
    """Picks one strategy class at random and returns a new instance of it.

    Args:
        strategies (list): Strategy classes (callables taking no arguments).
        strategy_weights (list, optional): Sampling weights, one per entry in
            ``strategies``. Defaults to None, which samples uniformly.

    Returns:
        The object produced by calling the selected strategy class.
    """
    (strategy_cls,) = random.choices(strategies, weights=strategy_weights, k=1)
    return strategy_cls()
def get_agent_stats(agents):
    """Collects a snapshot of each agent's state into a table.

    Args:
        agents (list): Agent objects exposing ``id``, ``strategy.name``,
            ``attractiveness``, ``estimated_attractiveness``,
            ``reported_attributes``, ``hidden_attributes``, ``liked``,
            ``passed``, ``match_count`` and ``happiness``.

    Returns:
        DataFrame: One row per agent; ``LIKED`` and ``PASSED`` hold the sizes
        of the agent's ``liked`` and ``passed`` collections.
    """
    rows = [
        {
            "ID": agent.id,
            "STRATEGY": agent.strategy.name,
            "ATTRACTIVENESS": agent.attractiveness,
            "EST_ATTRACTIVENESS": agent.estimated_attractiveness,
            "REPORTED_ATTRIBUTES": agent.reported_attributes,
            "HIDDEN_ATTRIBUTES": agent.hidden_attributes,
            "LIKED": len(agent.liked),
            "PASSED": len(agent.passed),
            "MATCHED": agent.match_count,
            "HAPPINESS": agent.happiness,
        }
        for agent in agents
    ]
    return pd.DataFrame(rows)
def sample_bmi_from_sex_age(sex, age_group):
    """Draws a BMI group for a person of the given sex and age group.

    The per-cell probabilities are based on LLCP2022:
    https://www.cdc.gov/brfss/annual_data/annual_2022.html

    Args:
        sex (str): "Male" or "Female."
        age_group (str|int): Age group ID (1: 18-24, 2: 25-29, 3: 30-34,
            4: 35-39, 5: 40-44, 6: 45-49, 7: 50-54, 8: 55-59, 9: 60-64,
            10: 65-69, 11: 70-74, 12: 75-79, 13: 80+).

    Returns:
        int: BMI group ID (1: Underweight, 2: Normal weight, 3: Overweight,
        4: Obesity)

    Raises:
        KeyError: If ``sex`` or ``age_group`` is not present in the table.
    """
    # Outer key: sex; middle key: age group ID; inner key: BMI group ID.
    bmi_by_sex_age = {
        "Male": {
            # 18-24
            "1": {
                "1": 0.04840228968915296,
                "2": 0.47235707557423373,
                "3": 0.27845808274762696,
                "4": 0.2007825519889863,
            },
            # 25-29
            "2": {
                "1": 0.023654697954952408,
                "2": 0.3539722929035906,
                "3": 0.35793044953350295,
                "4": 0.264442559607954,
            },
            # 30-34
            "3": {
                "1": 0.012286689419795221,
                "2": 0.29197952218430034,
                "3": 0.38515358361774743,
                "4": 0.310580204778157,
            },
            # 35-39
            "4": {
                "1": 0.009686743223185688,
                "2": 0.2456839309428951,
                "3": 0.3991875634716038,
                "4": 0.34544176236231544,
            },
            # 40-44
            "5": {
                "1": 0.006257153758107593,
                "2": 0.21404044257916827,
                "3": 0.41091186570011445,
                "4": 0.3687905379626097,
            },
            # 45-49
            "6": {
                "1": 0.006302353410450738,
                "2": 0.18994814519345832,
                "3": 0.40095731950538493,
                "4": 0.40279218189070604,
            },
            # 50-54
            "7": {
                "1": 0.007841518778373916,
                "2": 0.16962443252166737,
                "3": 0.412573944146375,
                "4": 0.4099601045535837,
            },
            # 55-59
            "8": {
                "1": 0.005810718921926191,
                "2": 0.17765964022995612,
                "3": 0.41045929405946713,
                "4": 0.40607034678865056,
            },
            # 60-64
            "9": {
                "1": 0.008853730092204526,
                "2": 0.19975901089689857,
                "3": 0.41733025984911987,
                "4": 0.37405699916177704,
            },
            # 65-69
            "10": {
                "1": 0.007913389649480532,
                "2": 0.20919506748227984,
                "3": 0.4301388484318866,
                "4": 0.35275269443635304,
            },
            # 70-74
            "11": {
                "1": 0.008408219034105838,
                "2": 0.23422145146881077,
                "3": 0.4316569446634085,
                "4": 0.3257133848336749,
            },
            # 75-79
            "12": {
                "1": 0.007979312892500923,
                "2": 0.2574067233099372,
                "3": 0.4494274104174363,
                "4": 0.2851865533801256,
            },
            # 80+
            "13": {
                "1": 0.012981298129812982,
                "2": 0.3428676200953429,
                "3": 0.44620462046204623,
                "4": 0.19794646131279794,
            },
        },
        "Female": {
            # 18-24
            "1": {
                "1": 0.04786969907637303,
                "2": 0.48912503724302314,
                "3": 0.23408481477803159,
                "4": 0.22892044890257224,
            },
            # 25-29
            "2": {
                "1": 0.02436781609195402,
                "2": 0.38114942528735635,
                "3": 0.26586206896551723,
                "4": 0.32862068965517244,
            },
            # 30-34
            "3": {
                "1": 0.017873853640276997,
                "2": 0.3429721130451057,
                "3": 0.27063447501403703,
                "4": 0.3685195583005802,
            },
            # 35-39
            "4": {
                "1": 0.016477272727272726,
                "2": 0.32987012987012987,
                "3": 0.2814935064935065,
                "4": 0.3721590909090909,
            },
            # 40-44
            "5": {
                "1": 0.012779552715654952,
                "2": 0.30277137974589496,
                "3": 0.2917750204324244,
                "4": 0.3926740471060257,
            },
            # 45-49
            "6": {
                "1": 0.011997832649585882,
                "2": 0.2928245220218283,
                "3": 0.2927471166498955,
                "4": 0.4024305286786903,
            },
            # 50-54
            "7": {
                "1": 0.012770074169622702,
                "2": 0.28068365043534343,
                "3": 0.29532408900354723,
                "4": 0.4112221863914866,
            },
            # 55-59
            "8": {
                "1": 0.015816598542740357,
                "2": 0.28837154197026243,
                "3": 0.3054321426455779,
                "4": 0.39037971684141937,
            },
            # 60-64
            "9": {
                "1": 0.01994965088110958,
                "2": 0.30028974492946375,
                "3": 0.3140170047024177,
                "4": 0.36574359948700896,
            },
            # 65-69
            "10": {
                "1": 0.020012620571531598,
                "2": 0.3116830433606779,
                "3": 0.31393671684846297,
                "4": 0.3543676192193275,
            },
            # 70-74
            "11": {
                "1": 0.021507459794613448,
                "2": 0.3193664018601046,
                "3": 0.32454950590970744,
                "4": 0.3345766324355745,
            },
            # 75-79
            "12": {
                "1": 0.025180295807358515,
                "2": 0.34842928737318174,
                "3": 0.33944505561667276,
                "4": 0.28694536120278696,
            },
            # 80+
            "13": {
                "1": 0.03670282875392882,
                "2": 0.4369360235222549,
                "3": 0.3268275372604684,
                "4": 0.19953361046334786,
            },
        },
    }

    # The table is keyed by strings, so an integer age group is converted.
    group_probs = bmi_by_sex_age[sex][str(age_group)]
    bmi_ids = list(group_probs.keys())
    (chosen,) = random.choices(
        population=bmi_ids, weights=group_probs.values(), k=1
    )
    return int(chosen)
def sample_age_from_sex(sex, consider_dating_population=True):
    """Draws an age group for a person of the given sex.

    Base probabilities are based on LLCP2022
    (https://www.cdc.gov/brfss/annual_data/annual_2022.html). They can
    optionally be re-weighted by the online dating penetration of each age
    band (based on
    https://www.pewresearch.org/short-reads/2023/02/02/key-findings-about-online-dating-in-the-u-s/).

    Args:
        sex (str): "Male" or "Female."
        consider_dating_population (bool, optional): Whether to multiply each
            age probability by the online dating weight of its age band
            before sampling. Defaults to True.

    Returns:
        int: Age group ID (1: 18-24, 2: 25-29, 3: 30-34, 4: 35-39, 5: 40-44,
        6: 45-49, 7: 50-54, 8: 55-59, 9: 60-64, 10: 65-69, 11: 70-74,
        12: 75-79, 13: 80+).

    Raises:
        KeyError: If ``sex`` is not a key of the probability table.
    """
    age_by_sex = {
        "Male": {
            "1": 0.07398741365778994,
            "2": 0.05641430611817123,
            "3": 0.06279522089469297,
            "4": 0.06805496653553862,
            "5": 0.07010719351866747,
            "6": 0.06602223814607372,
            "7": 0.07659535056034084,
            "8": 0.08438503872908167,
            "9": 0.0991113515937663,
            "10": 0.10595535796980643,
            "11": 0.09738572605450832,
            "12": 0.06916638637438274,
            "13": 0.07001944984717977,
        },
        "Female": {
            "1": 0.05094192542549045,
            "2": 0.045112814516478285,
            "3": 0.055974189077995756,
            "4": 0.06307652332077433,
            "5": 0.06738556147416742,
            "6": 0.06490407518080638,
            "7": 0.0776536312849162,
            "8": 0.0844917933393963,
            "9": 0.10471179247325797,
            "10": 0.1098393313412152,
            "11": 0.10174526871941449,
            "12": 0.07937724654627344,
            "13": 0.09478584729981378,
        },
    }
    # Online dating weight per age group ID (same value within each age band).
    dating_penetration = {
        "1": 0.53,
        "2": 0.53,
        "3": 0.37,
        "4": 0.37,
        "5": 0.37,
        "6": 0.37,
        "7": 0.20,
        "8": 0.20,
        "9": 0.20,
        "10": 0.13,
        "11": 0.13,
        "12": 0.13,
        "13": 0.13,
    }

    base_probs = age_by_sex[sex]
    if consider_dating_population:
        sampling_weights = [
            prob * dating_penetration[group] for group, prob in base_probs.items()
        ]
    else:
        sampling_weights = list(base_probs.values())

    (chosen,) = random.choices(
        population=list(base_probs.keys()), weights=sampling_weights, k=1
    )
    return int(chosen)
# Fitted height KDEs keyed by (sex code, age group ID). Fitting filters the
# whole survey table, so each estimator is built on first use and then reused.
_HEIGHT_KDE_CACHE = {}


def sample_height_from_sex_age(sex, age_group):
    """Samples height from the provided sex and age group.

    A Gaussian kernel density estimate is fitted to the non-null ``HTM4``
    values of the matching rows of ``LLCP2022_DATA`` (based on LLCP2022:
    https://www.cdc.gov/brfss/annual_data/annual_2022.html), and one value is
    drawn from it with ``gaussian_kde.resample``, called without an explicit
    seed. The fitted estimator is cached per (sex, age group), so repeated
    calls do not re-filter the dataset or refit the KDE.

    Args:
        sex (str): "Male" or "Female." Any value other than "Male" is treated
            as Female.
        age_group (str|int): Age group ID (1: 18-24, 2: 25-29, 3: 30-34,
            4: 35-39, 5: 40-44, 6: 45-49, 7: 50-54, 8: 55-59, 9: 60-64,
            10: 65-69, 11: 70-74, 12: 75-79, 13: 80+).

    Returns:
        int: Sampled height rounded to the nearest integer. The value is not
        clamped, so it may fall outside the observed data range.
    """
    # The dataset codes sex numerically in the "_SEX" column: 1 for Male,
    # 2 otherwise.
    sex_code = 1 if sex == "Male" else 2
    age = int(age_group)

    cache_key = (sex_code, age)
    kde = _HEIGHT_KDE_CACHE.get(cache_key)
    if kde is None:
        dataset = LLCP2022_DATA
        heights = dataset[
            (dataset["_SEX"] == sex_code)
            & (dataset["_AGEG5YR"] == age)
            & (~dataset["HTM4"].isnull())
        ]["HTM4"].tolist()
        kde = scipy.stats.gaussian_kde(heights)
        _HEIGHT_KDE_CACHE[cache_key] = kde

    # resample(1) returns a 1x1 array. Round to the nearest integer, as
    # documented, instead of truncating toward zero.
    return int(round(kde.resample(1)[0][0]))
def sample_age_preference(sex, age_group, allowed_diff=3):
    """Samples an age preference range for a given sex and age group.

    An age-gap category is drawn from observed couple age differences (based
    on the 2017 US Current Population Survey:
    https://www2.census.gov/programs-surveys/demo/tables/families/2017/cps-2017/tabfg3-all.xls)
    and turned into a range of age group IDs whose width is governed by the
    user-provided ``allowed_diff``. Both bounds are always clamped to the
    valid age group IDs (1 to 13).

    Args:
        sex (str): "Male" or "Female." Any value other than "Male" is treated
            as Female.
        age_group (str|int): Age group ID (1: 18-24, 2: 25-29, 3: 30-34,
            4: 35-39, 5: 40-44, 6: 45-49, 7: 50-54, 8: 55-59, 9: 60-64,
            10: 65-69, 11: 70-74, 12: 75-79, 13: 80+).
        allowed_diff (int, optional): Allowed maximum difference using age
            group IDs. Defaults to 3.

    Returns:
        list: Age preference range as ``[minimum, maximum]`` age group IDs.

    Raises:
        ValueError: If ``age_group`` cannot be converted to an integer.
    """
    min_group, max_group = 1, 13  # valid age group IDs

    # Key: signed age-gap category (positive = husband older); value: share.
    observed_differences = {
        "5": 0.01,  # Husband 20+ years older than wife
        "4": 0.014,  # Husband 15-19 years older than wife
        "3": 0.047,  # Husband 10-14 years older than wife
        "2": 0.115,  # Husband 6-9 years older than wife
        "1": 0.122,  # Husband 4-5 years older than wife
        # Husband 2-3 years older than wife / Husband and wife within 1 year /
        # Wife 2-3 years older than husband
        "0": 0.617,
        "-1": 0.032,  # Wife 4-5 years older than husband
        "-2": 0.027,  # Wife 6-9 years older than husband
        "-3": 0.009,  # Wife 10-14 years older than husband
        "-4": 0.003,  # Wife 15-19 years older than husband
        "-5": 0.004,  # Wife 20+ years older than husband
    }

    # Accept "5" as well as 5, as the documented str|int type promises.
    age = int(age_group)

    random_diff = int(
        random.choices(
            population=list(observed_differences.keys()),
            weights=observed_differences.values(),
            k=1,
        )[0]
    )

    if random_diff == 0:
        # Similar-age couple: centre the window on the agent's own age group.
        pref_min = math.ceil(age - allowed_diff / 2)
        pref_max = math.floor(age + allowed_diff / 2)
    else:
        gap = abs(random_diff)
        # A male in an older-male couple, or a female in an older-female
        # couple, is the older partner and therefore looks for younger ages.
        seeks_younger = (sex == "Male") == (random_diff > 0)
        if seeks_younger:
            pref_min = max(min_group, age - gap)
            pref_max = max(age, pref_min + allowed_diff)
        else:
            pref_max = min(max_group, age + gap)
            pref_min = min(age, pref_max - allowed_diff)

    # Clamp both ends so no branch can return an ID outside 1..13.
    return [max(min_group, pref_min), min(max_group, pref_max)]
def get_height_preference(gender, height):
    """Derives a height preference range from an agent's gender and height.

    The result is deterministic. Each bound is limited by the height range
    from LLCP2022 data (``LLCP2022_HEIGHT_RANGE``).

    Args:
        gender (str): Agent gender. Any value other than "Male" is treated as
            Female.
        height (int): Agent height.

    Returns:
        list: Height preference range, as [min, max]. For "Male" the window is
        ``height - 30`` to ``height + 10``; otherwise it is ``height + 5`` to
        ``height + 50``.
    """
    bounds = LLCP2022_HEIGHT_RANGE
    if gender == "Male":
        lower_offset, upper_offset = -30, 10
    else:  # Female
        lower_offset, upper_offset = 5, 50

    preferred_min = max(height + lower_offset, bounds[0])
    preferred_max = min(height + upper_offset, bounds[1])
    return [preferred_min, preferred_max]
def get_bmi_preference(gender, bmi):
    """Derives a BMI-group preference range from an agent's gender and BMI.

    The result is deterministic.

    Args:
        gender (str): Agent gender. Any value other than "Male" is treated as
            Female.
        bmi (int): Agent BMI group ID.

    Returns:
        list: BMI preference range, as [min, max]. For "Male" it runs from
        group 1 up to the agent's own group, capped at 4. Otherwise it spans
        one group either side of the agent's own, with a floor of 2 and a cap
        of 4, except for agents not above group 1, who get
        ``[bmi, bmi + 1]``.
    """
    if gender == "Male":
        upper = min(4, bmi)
        return [1, upper]

    # Female
    lower, upper = (
        (max(2, bmi - 1), min(4, bmi + 1)) if bmi > 1 else (bmi, bmi + 1)
    )
    return [lower, upper]