Private traits and attributes are predictable from digital records of human behavior

A Python implementation of the paper. Each user's likes are randomly generated, while the list of like categories was extracted manually from the dataset below.

  • Data has been downloaded from: https://www.propublica.org/datastore/dataset/facebook-ad-categories

    Kosinski et al (2013), Private traits and attributes are predictable from digital records of human behavior, PNAS

  • In [ ]:
    import numpy as np
    import pandas as pd
    from numpy.linalg import svd
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.decomposition import TruncatedSVD
    import random
    
    # Setting data
    # Seed both random sources: the synthetic likes and targets below are drawn
    # with the stdlib `random` module, so seeding NumPy alone does not make the
    # generated data reproducible.
    np.random.seed(42)
    random.seed(42)
    # Despite its name, DATA_DIR holds the path of the CSV file itself.
    DATA_DIR = "fb-ad-groups-crowd-sourced.csv"
    df = pd.read_csv(DATA_DIR, engine="python")
    # Category names, used as the columns of the synthetic like matrix.
    cols = df['name'].values
    print("Number of categories: {}".format(len(cols)))
    
    Number of categories: 6065
    
    • We will generate random data for `number_of_users` users, using the categories from fb-ad-groups-crowd-sourced.csv
    In [ ]:
    def generate_target_variables(number_of_users=55000, target=('age', 'gender', 'political')):
        """
            Generates random target variables for the synthetic users

            Arguments:
                number_of_users: int, number of values drawn per target
                target: name or iterable of names of the targets to generate;
                        any of 'age', 'gender' and 'political'
            Returns:
                dict with the keys 'age', 'gender' and 'political'. Each
                requested target maps to a list of number_of_users random
                ints; targets that were not requested map to an empty list.
            Raises:
                ValueError: if target contains an unknown name
        """
        # A bare string would otherwise be split into single characters.
        if isinstance(target, str):
            target = (target,)
        requested = set(target)

        # Inclusive bounds per target:
        #   age       -> regression, 18 to 75
        #   gender    -> classification, 1 or 0 (m or f)
        #   political -> classification, 1 -> democratic, 0 -> republican
        bounds = {"age": (18, 75), "gender": (0, 1), "political": (0, 1)}

        unknown = requested - set(bounds)
        if unknown:
            raise ValueError("Unknown target(s): {}".format(sorted(unknown)))

        # Every key is always present so callers can index the result safely.
        data = {name: [] for name in bounds}

        # Draw in the fixed order age, gender, political so that a seeded
        # `random` yields the same values regardless of how target is ordered.
        for name, (low, high) in bounds.items():
            if name in requested:
                data[name] = [random.randint(low, high) for _ in range(number_of_users)]

        return data
    
    
    def generate_data(number_of_users=55000, cols=cols):
        """
            Generates random data consisting of user's likes and dislikes

            Arguments:
                number_of_users: int, number of synthetic users (rows)
                cols: category names used as columns; a NumPy array or any
                      other iterable of names. The default is the module-level
                      `cols` as it was when this function was defined.
            Returns:
                tuple (DataFrame, index, cols): the 0/1 like matrix with one
                row per user, the list of row labels ("User 1", ...), and the
                category names as a plain list
            Raises:
                AssertionError: if number_of_users is greater than len(cols)
        """
        assert number_of_users <= len(cols), "Number of users and cols should be less or equal."
        index = ["User {}".format(i) for i in range(1, number_of_users + 1)]  # Number of user

        # generic categories
        # Accept a plain list as well as an ndarray: this function returns the
        # names as a list, and callers feed that list back in on a re-run.
        cols = cols.tolist() if hasattr(cols, "tolist") else list(cols)

        # random liking or disliking ( 1 or 0), drawn column by column and
        # user by user. A repeated category name keeps a single column.
        data = {
            col: [random.randint(0, 1) for _ in range(number_of_users)]
            for col in cols
        }

        print("Data generation complete.")
        return pd.DataFrame(data=data, index=index), index, cols
    
    
    def generated_reduced_df(df, index, number_of_users=55000):
        """
            Builds the modelling frame: reduced like matrix plus random targets

            Arguments:
                df: DataFrame of likes; its values are passed to dimen_reduce
                index: row labels for the resulting frame
                number_of_users: unused; the number of targets generated is
                                 taken from the length of the reduced frame
            Returns:
                DataFrame holding the reduced components followed by the
                'age', 'gender' and 'political' columns
        """
        components = dimen_reduce(df.values)
        reduced_frame = pd.DataFrame(data=components, index=index)

        # One random value per row for each target, appended as extra columns.
        targets = generate_target_variables(len(reduced_frame))
        for name in ('age', 'gender', 'political'):
            reduced_frame[name] = targets[name]

        return reduced_frame
    
    
    def dimen_reduce(values, n_components=100):
        """
            Reduces the like matrix with a truncated SVD

            Arguments:
                values: 2-D array-like passed to TruncatedSVD.fit_transform
                n_components: int, number of components to keep (default 100)
            Returns:
                the array returned by TruncatedSVD.fit_transform
        """
        reducer = TruncatedSVD(n_components=n_components)
        return reducer.fit_transform(values)
    
    def split_dataset(df, test_size=0.2, stype="linear"):
        """
            Splits the modelling frame into train and test sets for one target

            Arguments:
                df: DataFrame containing the feature columns and the target
                    column selected by stype
                test_size: passed through to train_test_split
                stype: which target to predict:
                       'linear'      -> 'age'
                       'clas_gender' -> 'gender'
                       'clas_pol'    -> 'political'
            Returns:
                tuple (x_train, x_test, y_train, y_test)
            Raises:
                ValueError: if stype is not one of the values above
        """
        target_by_stype = {
            'linear': 'age',
            'clas_gender': 'gender',
            'clas_pol': 'political',
        }
        # Fail loudly here rather than returning None, which would only show
        # up as an unpacking error at the call site.
        if stype not in target_by_stype:
            raise ValueError(
                "Unknown stype {!r}; expected one of {}".format(stype, sorted(target_by_stype))
            )
        target = target_by_stype[stype]

        # NOTE(review): only the selected target is dropped, so the other
        # target columns present in df stay among the features - confirm that
        # this is intended.
        features = df.drop(columns=[target]).values
        labels = df[target].values

        x_train, x_test, y_train, y_test = train_test_split(
            features, labels, random_state=42, test_size=test_size
        )
        return x_train, x_test, y_train, y_test
      
    def start(number_of_users=1000, cols=cols[:1000]):
        """
            Runs the whole pipeline: generate data, reduce it, split and fit

            Arguments:
                number_of_users: int, number of synthetic users
                cols: category names to use. generate_data asserts that
                      number_of_users <= len(cols), so the default provides
                      1000 categories for the default 1000 users.
            Returns:
                tuple (lgr, lgr2, lr): the fitted LogisticRegression for
                gender, the fitted LogisticRegression for political
                preference, and the fitted LinearRegression for age
        """
        # 1000 number of users and 1000 number of ad categories
        generated_df, index, _ = generate_data(number_of_users=number_of_users, cols=cols)
        d = generated_reduced_df(generated_df, index)

        # for linear (age)
        x_train_linear, x_test_linear, y_train_linear, y_test_linear = split_dataset(d, stype="linear")

        # for classification (gender)
        x_train_gender, x_test_gender, y_train_gender, y_test_gender = split_dataset(d, stype="clas_gender")

        # for political
        x_train_pol, x_test_pol, y_train_pol, y_test_pol = split_dataset(d, stype="clas_pol")

        # Report shapes only; printing a whole training matrix floods the output.
        print("x_train_linear: {}".format(x_train_linear.shape))
        print("x_train_gender: {}".format(x_train_gender.shape))
        print("x_train_pol: {}".format(x_train_pol.shape))

        # for linear values
        lr = LinearRegression()
        lr.fit(x_train_linear, y_train_linear)

        # for gender and political preference
        lgr = LogisticRegression()
        lgr.fit(x_train_gender, y_train_gender)

        lgr2 = LogisticRegression()
        lgr2.fit(x_train_pol, y_train_pol)

        return lgr, lgr2, lr
    
    In [ ]:
    # Build the synthetic like matrix: 1000 users by the first 1000 ad categories.
    # Note: this rebinds `cols` to the list of category names that was used.
    generated_df, index, cols = generate_data(
        number_of_users=1000,
        cols=cols[:1000],
    )
    
    Data generation complete.
    
    In [ ]:
    split_dataset(d, stype="linear")
    
    In [ ]:
    d = generated_reduced_df(generated_df, index)
    
    In [ ]:
    # Age: regression target.
    (x_train_linear, x_test_linear,
     y_train_linear, y_test_linear) = split_dataset(d, stype="linear")
    
    # Gender: binary classification target.
    (x_train_gender, x_test_gender,
     y_train_gender, y_test_gender) = split_dataset(d, stype="clas_gender")
    
    # Political preference: binary classification target.
    (x_train_pol, x_test_pol,
     y_train_pol, y_test_pol) = split_dataset(d, stype="clas_pol")
    
    In [ ]:
    # Quick check of how many labels ended up in the gender training split.
    y_train_gender.shape
    
    Out[ ]:
    (800,)
    In [ ]:
    from sklearn.model_selection import cross_val_score
    
    In [ ]:
    # 5-fold cross-validation of a plain linear regression on the age task.
    # An unfitted estimator is passed directly: no `linear_reg` is defined in
    # the visible notebook, and cross_val_score fits its own copy per fold.
    cross_val_score(LinearRegression(), x_train_linear, y_train_linear, cv=5)
    
    Out[ ]:
    array([-0.13186307, -0.02997988, -0.1590985 , -0.16649672, -0.20874096])
    In [ ]:
     
    
    In [ ]:
    from sklearn.linear_model import LinearRegression
    
    In [ ]:
    # Fit a linear regression on the age training split.
    lr = LinearRegression()
    lr.fit(x_train_linear, y_train_linear)
    
    Out[ ]:
    LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    In [ ]:
    # Score of the fitted model on the data it was trained on
    # (for LinearRegression, `score` is the R^2 coefficient of determination).
    lr.score(x_train_linear, y_train_linear)
    
    Out[ ]:
    0.11396594235333246
    In [ ]:
    # Score of the fitted model on the held-out split. The age labels come from
    # random.randint, so no real predictive signal is expected here.
    lr.score(x_test_linear, y_test_linear)
    
    Out[ ]:
    -0.13689786512868563
    In [ ]: