A Python implementation of the paper. Likes for each category are generated at random, but the Like categories themselves were extracted manually.
Kosinski, M., Stillwell, D., & Graepel, T. (2013). Private traits and attributes are predictable from digital records of human behavior. PNAS, 110(15), 5802–5805.
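For orientation, a minimal sketch (not part of the pipeline below, with made-up category names) of the user-Like matrix the code constructs: one row per user, one column per Like category, and a binary entry for like / no like.

import pandas as pd

# Illustrative user-Like matrix; names and values are invented for this sketch.
example = pd.DataFrame(
    [[1, 0, 1],
     [0, 1, 0]],
    index=["User 1", "User 2"],
    columns=["Category A", "Category B", "Category C"],
)
print(example)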
import numpy as np
import pandas as pd
from numpy.linalg import svd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
import random
# Setting data
np.random.seed(42)
DATA_PATH = "fb-ad-groups-crowd-sourced.csv"
df = pd.read_csv(DATA_PATH, engine="python")
cols = df['name'].values
print("Number of categories: {}".format(len(cols)))
def generate_target_variables(number_of_users=55000):
# for each target we will generate random data
data = {"age":[], "gender":[], "political":[]}
    # for age (age range: 18 to 75)
# regression
data['age'] = [random.randint(18, 75) for x in range(1, number_of_users+1)]
# for gender (m or f) ( 1 or 0)
# classification
data['gender'] = [random.randint(0,1) for x in range(1, number_of_users+1)]
# for political
# classification (1 -> democratic, 0 -> republican)
data['political'] = [random.randint(0,1) for x in range(1, number_of_users+1)]
return data
def generate_data(number_of_users=55000, cols=cols):
"""
Generates random data consisting of user's likes and dislikes
Arguments:
number_of_users: int
Returns:
DataFrame
"""
assert number_of_users <= len(cols), "Number of users and cols should be less or equal."
index = ["User {}".format(i) for i in range(1, number_of_users+1)] # Number of user
# generic categories
cols = cols.tolist()
# target variables
data = {col: [] for col in cols}
# random liking or disliking ( 1 or 0)
def like_or_not(): return random.randint(0,1)
for col in cols:
#print("Adding for {}".format(col))
for i in range(1,number_of_users+1):
data[col].append(like_or_not())
print("Data generation complete.")
return pd.DataFrame(data=data, index=index), index, cols
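# Sanity-check sketch (an addition, not in the original pipeline): generate a small
# Likes matrix and confirm its shape before running the full workflow.
sample_df, sample_index, sample_cols = generate_data(number_of_users=50, cols=cols[:100])
print(sample_df.shape)           # expected: (50, 100)
print(sample_df.iloc[:3, :5])    # first few users and categories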
def generated_reduced_df(df, index):
    # reduce the Likes matrix to its top 100 SVD components and attach the targets
    tr = dimen_reduce(df.values)
    dimen_df = pd.DataFrame(data=tr, index=index)
target = generate_target_variables(len(dimen_df))
dimen_df['age'] = target['age']
dimen_df['gender'] = target['gender']
dimen_df['political'] = target['political']
return dimen_df
def dimen_reduce(values):
reduced = TruncatedSVD(n_components=100)
tr = reduced.fit_transform(values)
return tr
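# Optional sketch (an addition): a variant of dimen_reduce that also reports how much
# variance the 100 components retain, via TruncatedSVD's explained_variance_ratio_.
def dimen_reduce_with_variance(values, n_components=100):
    svd_model = TruncatedSVD(n_components=n_components)
    transformed = svd_model.fit_transform(values)
    print("Variance retained by {} components: {:.2%}".format(
        n_components, svd_model.explained_variance_ratio_.sum()))
    return transformed, svd_model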
def split_dataset(df, test_size=0.2, stype="linear"):
    # features are the 100 SVD components only; all three target columns are dropped
    feature_values = df.drop(columns=['age', 'gender', 'political']).values
    features_age, labels_age = feature_values, df['age'].values
    features_gender, labels_gender = feature_values, df['gender'].values
    features_political, labels_political = feature_values, df['political'].values
if stype == 'linear':
x_train, x_test, y_train, y_test = train_test_split(features_age, labels_age, random_state=42, test_size=test_size)
return x_train, x_test, y_train, y_test
if stype == 'clas_gender':
x_train, x_test, y_train, y_test = train_test_split(features_gender, labels_gender, random_state=42, test_size=test_size)
return x_train, x_test, y_train, y_test
if stype == 'clas_pol':
x_train, x_test, y_train, y_test = train_test_split(features_political, labels_political, random_state=42, test_size=test_size)
return x_train, x_test, y_train, y_test
def start(number_of_users=1000, cols=cols[:100]):
    # defaults: 1,000 users and the first 100 ad categories
generated_df, index, cols = generate_data(number_of_users=number_of_users, cols=cols)
d = generated_reduced_df(generated_df, index)
# for linear (age)
x_train_linear, x_test_linear, y_train_linear, y_test_linear = split_dataset(d, stype="linear")
#for classification (gender)
x_train_gender, x_test_gender, y_train_gender, y_test_gender = split_dataset(d, stype="clas_gender")
# for political
x_train_pol, x_test_pol, y_train_pol, y_test_pol = split_dataset(d, stype="clas_pol")
print("x_train_linear: {}".format(x_train_linear.shape))
print("x_train_gender: {}".format(x_train_gender.shape))
print("x_train_pol: {}".format(x_train_pol))
# for linear values
lr = LinearRegression()
lr.fit(x_train_linear, y_train_linear)
#for gender and political preference
lgr = LogisticRegression()
lgr.fit(x_train_gender, y_train_gender)
lgr2 = LogisticRegression()
lgr2.fit(x_train_pol, y_train_pol)
return lgr, lgr2, lr
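# Evaluation sketch (an addition): rebuild the train/test splits from a reduced DataFrame
# and report held-out scores for all three models. `reduced_df` is assumed to be the output
# of generated_reduced_df(); max_iter is raised only to avoid convergence warnings.
def evaluate(reduced_df):
    x_tr, x_te, y_tr, y_te = split_dataset(reduced_df, stype="linear")
    age_model = LinearRegression().fit(x_tr, y_tr)
    print("Age R^2 (test): {:.3f}".format(age_model.score(x_te, y_te)))
    for stype, name in [("clas_gender", "Gender"), ("clas_pol", "Political")]:
        x_tr, x_te, y_tr, y_te = split_dataset(reduced_df, stype=stype)
        clf = LogisticRegression(max_iter=1000).fit(x_tr, y_tr)
        print("{} accuracy (test): {:.3f}".format(name, clf.score(x_te, y_te)))
# usage (after the pipeline below has built `d`): evaluate(d)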
# 1,000 users and the first 1,000 ad categories
generated_df, index, cols = generate_data(number_of_users=1000, cols=cols[:1000])
d = generated_reduced_df(generated_df, index)
# for linear (age)
x_train_linear, x_test_linear, y_train_linear, y_test_linear = split_dataset(d, stype="linear")
#for classification (gender)
x_train_gender, x_test_gender, y_train_gender, y_test_gender = split_dataset(d, stype="clas_gender")
# for political
x_train_pol, x_test_pol, y_train_pol, y_test_pol = split_dataset(d, stype="clas_pol")
print("y_train_gender: {}".format(y_train_gender.shape))
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation for the age regression (LinearRegression is already imported above)
lr = LinearRegression()
print(cross_val_score(lr, x_train_linear, y_train_linear, cv=5))
lr.fit(x_train_linear, y_train_linear)
print("Age R^2 (train): {}".format(lr.score(x_train_linear, y_train_linear)))
print("Age R^2 (test): {}".format(lr.score(x_test_linear, y_test_linear)))
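# A further sketch (an addition): cross-validated accuracy for the two classifiers,
# mirroring the cross_val_score call used for the age regression above.
cv_gender = cross_val_score(LogisticRegression(max_iter=1000), x_train_gender, y_train_gender, cv=5)
cv_pol = cross_val_score(LogisticRegression(max_iter=1000), x_train_pol, y_train_pol, cv=5)
print("Gender CV accuracy: {:.3f}".format(cv_gender.mean()))
print("Political CV accuracy: {:.3f}".format(cv_pol.mean()))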