Source code for catfish_sim.utils

import random
import pandas as pd
import scipy.stats
import math
import os

# Inclusive [min, max] of the age-group IDs used by the samplers in this module (1: 18-24 ... 13: 80+).
LLCP2022_AGE_GROUP_RANGE = [1, 13]
# [min, max] height bounds used to clamp height preferences; presumably centimeters (BRFSS HTM4) -- TODO confirm.
LLCP2022_HEIGHT_RANGE = [91, 241]
# Inclusive [min, max] of the BMI group IDs (1: Underweight ... 4: Obesity).
LLCP2022_BMI_GROUP_RANGE = [1, 4]
# LLCP2022 extract, read once at import time from the CSV that ships next to this module.
LLCP2022_DATA = pd.read_csv(
    os.path.join(os.path.dirname(__file__), "data/LLCP2022_sex_age_height_bmi.csv")
)



[docs]
def get_random_gender(male_weight=0.72):
    """Draws a gender label at random.

    Args:
        male_weight (float, optional): Sampling weight of "Male"; "Female" receives the complementary weight
            (1 - male_weight). Defaults to 0.72 (based on online dating populations).

    Returns:
        str: Either "Male" or "Female".
    """
    labels = ["Male", "Female"]
    label_weights = [male_weight, (1 - male_weight)]
    (picked,) = random.choices(labels, weights=label_weights, k=1)
    return picked




[docs]
def get_random_strategy(strategies, strategy_weights=None):
    """Picks one strategy class at random and returns a fresh instance of it.

    Args:
        strategies (list): Candidate strategy classes (callables taking no arguments).
        strategy_weights (list, optional): Sampling weights, one per entry of ``strategies``. When None, every
            strategy is equally likely. Defaults to None.

    Returns:
        An instance of the selected strategy class.
    """
    (strategy_cls,) = random.choices(strategies, weights=strategy_weights, k=1)
    return strategy_cls()




[docs]
def get_agent_stats(agents):
    """Builds a table summarising the current state of each agent.

    Args:
        agents (list): Agent objects exposing ``id``, ``strategy.name``, ``attractiveness``,
            ``estimated_attractiveness``, ``reported_attributes``, ``hidden_attributes``, ``liked``, ``passed``,
            ``match_count`` and ``happiness``.

    Returns:
        DataFrame: One row per agent; ``LIKED`` and ``PASSED`` hold the sizes of the respective collections.
    """
    rows = [
        {
            "ID": member.id,
            "STRATEGY": member.strategy.name,
            "ATTRACTIVENESS": member.attractiveness,
            "EST_ATTRACTIVENESS": member.estimated_attractiveness,
            "REPORTED_ATTRIBUTES": member.reported_attributes,
            "HIDDEN_ATTRIBUTES": member.hidden_attributes,
            "LIKED": len(member.liked),
            "PASSED": len(member.passed),
            "MATCHED": member.match_count,
            "HAPPINESS": member.happiness,
        }
        for member in agents
    ]
    return pd.DataFrame(rows)




[docs]
def sample_bmi_from_sex_age(sex, age_group):
    """Draws a BMI group for the given sex and age group, using frequencies taken from LLCP2022:
    https://www.cdc.gov/brfss/annual_data/annual_2022.html

    Args:
        sex (str): "Male" or "Female."
        age_group (str|int): Age group ID (1: 18-24, 2: 25-29, 3: 30-34, 4: 35-39, 5: 40-44, 6: 45-49, 7: 50-54,
            8: 55-59, 9: 60-64, 10: 65-69, 11: 70-74, 12: 75-79, 13: 80+).

    Returns:
        int: BMI group ID (1: Underweight, 2: Normal weight, 3: Overweight, 4: Obesity)
    """
    bmi_groups = [1, 2, 3, 4]

    # Per sex and age group: sampling weights of BMI groups 1..4, in that order.
    bmi_weights = {
        "Male": {
            "1": (  # 18-24
                0.04840228968915296,
                0.47235707557423373,
                0.27845808274762696,
                0.2007825519889863,
            ),
            "2": (  # 25-29
                0.023654697954952408,
                0.3539722929035906,
                0.35793044953350295,
                0.264442559607954,
            ),
            "3": (  # 30-34
                0.012286689419795221,
                0.29197952218430034,
                0.38515358361774743,
                0.310580204778157,
            ),
            "4": (  # 35-39
                0.009686743223185688,
                0.2456839309428951,
                0.3991875634716038,
                0.34544176236231544,
            ),
            "5": (  # 40-44
                0.006257153758107593,
                0.21404044257916827,
                0.41091186570011445,
                0.3687905379626097,
            ),
            "6": (  # 45-49
                0.006302353410450738,
                0.18994814519345832,
                0.40095731950538493,
                0.40279218189070604,
            ),
            "7": (  # 50-54
                0.007841518778373916,
                0.16962443252166737,
                0.412573944146375,
                0.4099601045535837,
            ),
            "8": (  # 55-59
                0.005810718921926191,
                0.17765964022995612,
                0.41045929405946713,
                0.40607034678865056,
            ),
            "9": (  # 60-64
                0.008853730092204526,
                0.19975901089689857,
                0.41733025984911987,
                0.37405699916177704,
            ),
            "10": (  # 65-69
                0.007913389649480532,
                0.20919506748227984,
                0.4301388484318866,
                0.35275269443635304,
            ),
            "11": (  # 70-74
                0.008408219034105838,
                0.23422145146881077,
                0.4316569446634085,
                0.3257133848336749,
            ),
            "12": (  # 75-79
                0.007979312892500923,
                0.2574067233099372,
                0.4494274104174363,
                0.2851865533801256,
            ),
            "13": (  # 80+
                0.012981298129812982,
                0.3428676200953429,
                0.44620462046204623,
                0.19794646131279794,
            ),
        },
        "Female": {
            "1": (  # 18-24
                0.04786969907637303,
                0.48912503724302314,
                0.23408481477803159,
                0.22892044890257224,
            ),
            "2": (  # 25-29
                0.02436781609195402,
                0.38114942528735635,
                0.26586206896551723,
                0.32862068965517244,
            ),
            "3": (  # 30-34
                0.017873853640276997,
                0.3429721130451057,
                0.27063447501403703,
                0.3685195583005802,
            ),
            "4": (  # 35-39
                0.016477272727272726,
                0.32987012987012987,
                0.2814935064935065,
                0.3721590909090909,
            ),
            "5": (  # 40-44
                0.012779552715654952,
                0.30277137974589496,
                0.2917750204324244,
                0.3926740471060257,
            ),
            "6": (  # 45-49
                0.011997832649585882,
                0.2928245220218283,
                0.2927471166498955,
                0.4024305286786903,
            ),
            "7": (  # 50-54
                0.012770074169622702,
                0.28068365043534343,
                0.29532408900354723,
                0.4112221863914866,
            ),
            "8": (  # 55-59
                0.015816598542740357,
                0.28837154197026243,
                0.3054321426455779,
                0.39037971684141937,
            ),
            "9": (  # 60-64
                0.01994965088110958,
                0.30028974492946375,
                0.3140170047024177,
                0.36574359948700896,
            ),
            "10": (  # 65-69
                0.020012620571531598,
                0.3116830433606779,
                0.31393671684846297,
                0.3543676192193275,
            ),
            "11": (  # 70-74
                0.021507459794613448,
                0.3193664018601046,
                0.32454950590970744,
                0.3345766324355745,
            ),
            "12": (  # 75-79
                0.025180295807358515,
                0.34842928737318174,
                0.33944505561667276,
                0.28694536120278696,
            ),
            "13": (  # 80+
                0.03670282875392882,
                0.4369360235222549,
                0.3268275372604684,
                0.19953361046334786,
            ),
        },
    }

    # Age groups are keyed by their string form, so both 3 and "3" resolve to the same row.
    group_weights = bmi_weights[sex][str(age_group)]
    (bmi_group,) = random.choices(population=bmi_groups, weights=group_weights, k=1)
    return bmi_group




[docs]
def sample_age_from_sex(sex, consider_dating_population=True):
    """Draws an age group for the given sex from the LLCP2022 age distribution
    (https://www.cdc.gov/brfss/annual_data/annual_2022.html). Optionally re-weights each age group by its online
    dating penetration (based on
    https://www.pewresearch.org/short-reads/2023/02/02/key-findings-about-online-dating-in-the-u-s/).

    Args:
        sex (str): "Male" or "Female."
        consider_dating_population (bool, optional): When true, each age group's weight is multiplied by the
            online dating weight of that group before sampling. Defaults to True.

    Returns:
        int: Age group ID (1: 18-24, 2: 25-29, 3: 30-34, 4: 35-39, 5: 40-44, 6: 45-49, 7: 50-54, 8: 55-59, 9: 60-64,
            10: 65-69, 11: 70-74, 12: 75-79, 13: 80+).
    """

    # Share of each age group within a sex.
    population_share = {
        "Male": {
            "1": 0.07398741365778994,
            "2": 0.05641430611817123,
            "3": 0.06279522089469297,
            "4": 0.06805496653553862,
            "5": 0.07010719351866747,
            "6": 0.06602223814607372,
            "7": 0.07659535056034084,
            "8": 0.08438503872908167,
            "9": 0.0991113515937663,
            "10": 0.10595535796980643,
            "11": 0.09738572605450832,
            "12": 0.06916638637438274,
            "13": 0.07001944984717977,
        },
        "Female": {
            "1": 0.05094192542549045,
            "2": 0.045112814516478285,
            "3": 0.055974189077995756,
            "4": 0.06307652332077433,
            "5": 0.06738556147416742,
            "6": 0.06490407518080638,
            "7": 0.0776536312849162,
            "8": 0.0844917933393963,
            "9": 0.10471179247325797,
            "10": 0.1098393313412152,
            "11": 0.10174526871941449,
            "12": 0.07937724654627344,
            "13": 0.09478584729981378,
        },
    }

    # Online dating weight applied to each age group.
    dating_penetration = {
        "1": 0.53,
        "2": 0.53,
        "3": 0.37,
        "4": 0.37,
        "5": 0.37,
        "6": 0.37,
        "7": 0.20,
        "8": 0.20,
        "9": 0.20,
        "10": 0.13,
        "11": 0.13,
        "12": 0.13,
        "13": 0.13,
    }

    shares = population_share[sex]
    if consider_dating_population:
        sampling_weights = [
            share * dating_penetration[group_id] for group_id, share in shares.items()
        ]
    else:
        sampling_weights = list(shares.values())

    (group_id,) = random.choices(population=list(shares), weights=sampling_weights, k=1)
    return int(group_id)




[docs]
# Fitted height KDEs keyed by (sex code, age group ID). Fitting requires filtering the whole LLCP2022 table, and
# the result never changes for a given key, so each estimator is built once and reused.
_HEIGHT_KDE_CACHE = {}


def sample_height_from_sex_age(sex, age_group):
    """Samples height from the provided sex and age group (based on LLCP2022:
    https://www.cdc.gov/brfss/annual_data/annual_2022.html).

    A Gaussian kernel density estimate is fitted on the non-null ``HTM4`` values of the matching rows, and a single
    value is drawn from it. The fitted estimator is cached per (sex, age group), so repeated calls do not refit it;
    the draw itself is still random on every call.

    Args:
        sex (str): "Male" or "Female." Any value other than "Male" is treated as Female.
        age_group (str|int): Age group ID (1: 18-24, 2: 25-29, 3: 30-34, 4: 35-39, 5: 40-44, 6: 45-49, 7: 50-54,
            8: 55-59, 9: 60-64, 10: 65-69, 11: 70-74, 12: 75-79, 13: 80+).

    Returns:
        int: Sampled height, truncated to an integer with ``int()`` (not rounded to nearest).
    """
    sex_code = 1 if sex == "Male" else 2  # coding of the "_SEX" column as used by this lookup
    age = int(age_group)

    cache_key = (sex_code, age)
    kde = _HEIGHT_KDE_CACHE.get(cache_key)
    if kde is None:
        dataset = LLCP2022_DATA
        heights = dataset[
            (dataset["_SEX"] == sex_code)
            & (dataset["_AGEG5YR"] == age)
            & (~dataset["HTM4"].isnull())
        ]["HTM4"].tolist()
        kde = scipy.stats.gaussian_kde(heights)
        # Stored only after a successful fit, so a failing group is retried (and fails again) on the next call.
        _HEIGHT_KDE_CACHE[cache_key] = kde

    return int(kde.resample(1)[0][0])




[docs]
def sample_age_preference(sex, age_group, allowed_diff=3):
    """Samples age preference for a given sex and age group (based on the 2017 US Current Population Survey:
    https://www2.census.gov/programs-surveys/demo/tables/families/2017/cps-2017/tabfg3-all.xls), along with the
    user-provided allowed maximum difference using age group IDs.

    A partner age difference (in age-group steps) is drawn from the observed distribution. If the agent is the
    older partner, the range starts that many groups below the agent; if the agent is the younger partner, the
    range ends that many groups above the agent. The range is then made wide enough to include the agent's own
    group or span ``allowed_diff`` groups, whichever is wider. Both bounds are always clamped to the valid age
    group IDs (1..13).

    Args:
        sex (str): "Male" or "Female." Any value other than "Male" is treated as Female.
        age_group (str|int): Age group ID (1: 18-24, 2: 25-29, 3: 30-34, 4: 35-39, 5: 40-44, 6: 45-49, 7: 50-54,
            8: 55-59, 9: 60-64, 10: 65-69, 11: 70-74, 12: 75-79, 13: 80+).
        allowed_diff (int, optional): Allowed maximum difference using age group IDs. Defaults to 3.

    Returns:
        list: Age preference range [minimum, maximum], both within 1..13.

    Raises:
        ValueError: If ``age_group`` cannot be converted to an integer.
    """
    min_group, max_group = 1, 13

    # Key: husband's age minus wife's age, expressed in age-group steps. Value: observed share.
    observed_differences = {
        5: 0.01,  # Husband 20+ years older than wife
        4: 0.014,  # Husband 15-19 years older than wife
        3: 0.047,  # Husband 10-14 years older than wife
        2: 0.115,  # Husband 6-9 years older than wife
        1: 0.122,  # Husband 4-5 years older than wife
        0: 0.617,  # Husband 2-3 years older than wife / Husband and wife within 1 year / Wife 2-3 years older than husband
        -1: 0.032,  # Wife 4-5 years older than husband
        -2: 0.027,  # Wife 6-9 years older than husband
        -3: 0.009,  # Wife 10-14 years older than husband
        -4: 0.003,  # Wife 15-19 years older than husband
        -5: 0.004,  # Wife 20+ years older than husband
    }

    # Accept "5" as well as 5, as the docstring promises and as the sibling samplers do.
    age = int(age_group)

    (random_diff,) = random.choices(
        population=list(observed_differences),
        weights=list(observed_differences.values()),
        k=1,
    )

    if random_diff == 0:
        # Same-age pairing: centre the window on the agent's own group.
        pref_min = math.ceil(age - allowed_diff / 2)
        pref_max = math.floor(age + allowed_diff / 2)
    else:
        gap = abs(random_diff)
        # Positive differences mean the man is older, negative ones mean the woman is older.
        agent_is_older = (random_diff > 0) == (sex == "Male")
        if agent_is_older:
            pref_min = max(min_group, age - gap)
            pref_max = max(age, pref_min + allowed_diff)
        else:
            pref_max = min(max_group, age + gap)
            pref_min = min(age, pref_max - allowed_diff)

    # Keep both ends inside the valid age-group IDs regardless of which branch produced them.
    pref_min = max(min_group, pref_min)
    pref_max = min(max_group, pref_max)
    return [pref_min, pref_max]




[docs]
def get_height_preference(gender, height):
    """Derives a fixed (non-random) partner height range from the agent's gender and height.

    Male agents accept partners from 30 below to 10 above their own height; all other agents accept partners from
    5 to 50 above their own height. The lower end is raised to the LLCP2022 minimum height and the upper end is
    capped at the LLCP2022 maximum height.

    Args:
        gender (str): Agent gender. Any value other than "Male" is treated as Female.
        height (int): Agent height.

    Returns:
        list: Height preference range, as [min, max].
    """
    floor_height = LLCP2022_HEIGHT_RANGE[0]
    ceiling_height = LLCP2022_HEIGHT_RANGE[1]

    # (offset of the lower end, offset of the upper end) relative to the agent's own height.
    below, above = (-30, 10) if gender == "Male" else (5, 50)

    lower = max(height + below, floor_height)
    upper = min(height + above, ceiling_height)
    return [lower, upper]




[docs]
def get_bmi_preference(gender, bmi):
    """Derives a fixed (non-random) partner BMI group range from the agent's gender and BMI group.

    Male agents accept every group from 1 up to their own group (capped at 4). Other agents accept their own
    group plus or minus one, limited to groups 2..4, unless they are in group 1 (or below), in which case they
    accept their own group and the next one.

    Args:
        gender (str): Agent gender. Any value other than "Male" is treated as Female.
        bmi (int): Agent BMI group ID.

    Returns:
        list: BMI preference range, as [min, max].
    """
    if gender == "Male":
        upper = min(4, bmi)
        return [1, upper]

    # Female
    if bmi <= 1:
        return [bmi, bmi + 1]

    lower = max(2, bmi - 1)
    upper = min(4, bmi + 1)
    return [lower, upper]