I have a dataset that consists of customers and their product purchases for an ecommerce company that sells clothes. Along with this data, I have application logs that show each customer's interactions on the site. The data looks something like this:
import pandas as pd
data = {'customer_id':[369799, 103508, 294535, 222573, 204286, 254953, 268167, 56201, 168900, 96618],
'application_log':['web_pdp__click_main_banner web_pdp__click_prod',
'web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub',
'web_home__click_main_banner web_home__click_prod',
'web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub',
'web_pdp__click_main_banner web_pdp__click_prod web_pdp__view_hero web_pdp__hover_index web_pdp__click_sub',
'web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub',
'web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub',
'web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub',
'web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub',
'web_home__click_main_banner web_home__click_prod'],
'var_1':[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'var_2':[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'var_3':[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'var_4':[0, 1, 0, 5, 1, 3, 6, 7, 1, 0],
'var_5':[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
'targets':[1, 1, 0, 1, 1, 1, 1, 1, 1, 1]}
data = pd.DataFrame(data)
out:
customer_id | application_log | var_1 | var_2 | var_3 | var_4 | var_5 | targets |
---|---|---|---|---|---|---|---|
369799 | web_pdp__click_main_banner web_pdp__click_prod | 0 | 0 | 1 | 0 | 0 | 1 |
103508 | web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub | 0 | 0 | 1 | 1 | 0 | 1 |
294535 | web_home__click_main_banner web_home__click_prod | 0 | 0 | 1 | 0 | 0 | 0 |
222573 | web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub | 0 | 0 | 1 | 5 | 0 | 1 |
204286 | web_pdp__click_main_banner web_pdp__click_prod web_pdp__view_hero web_pdp__hover_index web_pdp__click_sub | 0 | 0 | 1 | 1 | 1 | 1 |
254953 | web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub | 0 | 0 | 1 | 3 | 0 | 1 |
268167 | web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub | 0 | 0 | 1 | 6 | 0 | 1 |
56201 | web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub web_pdp__click_sub | 0 | 0 | 1 | 7 | 0 | 1 |
168900 | web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub | 0 | 0 | 1 | 1 | 0 | 1 |
96618 | web_home__click_main_banner web_home__click_prod | 0 | 0 | 1 | 0 | 0 | 1 |
I want to predict the probability of a customer making a subsequent purchase, as denoted by the “targets” field above. I would like to do this in PyTorch using a “customer-as-a-text” paradigm, whereby each customer’s session log is split into discrete tokens that are grouped into a “customer-sentence”, which is then used to learn a “customer-embedding”. Similar to what’s being explained in the diagram below:
The diagram comes from the tutorial that I am trying to emulate, which uses doc2vec to generate embeddings and then concatenates those embeddings with the remaining tabular data. The problem is that I don’t quite understand how to do this in PyTorch. Specifically, I don’t know how to create the model or the custom dataset that PyTorch requires.
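To make concrete what I mean by a “customer-sentence”: each space-separated event in application_log is one token, and a customer’s full log is the sentence. A rough illustration of what I have in mind (just my own toy example):

log = "web_pdp__click_main_banner web_pdp__click_prod web_pdp__click_sub"
tokens = log.split(" ")
# ['web_pdp__click_main_banner', 'web_pdp__click_prod', 'web_pdp__click_sub']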
I would very much appreciate it if someone could provide code that takes this data and converts it into a PyTorch dataset, as well as code for a multi-modal model that passes the text features through an LSTM layer, combines the resulting text representation with the remaining numerical features in a multi-layer perceptron, and outputs the probability of the target column.
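To be explicit about the shape of the model I have in mind, something like the sketch below is roughly what I imagine (untested; the class name, layer sizes and defaults are just placeholders I made up, assuming the usual torch / torch.nn imports):

class CustomerModel(nn.Module):
    # rough sketch: LSTM over the tokenized log + MLP over the combined features
    def __init__(self, vocab_size, embed_dim=32, lstm_hidden=64, n_tabular=6, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, lstm_hidden, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden + n_tabular, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, tabular, text):
        embedded = self.embedding(text)       # (batch, seq_len, embed_dim)
        _, (h_n, _) = self.lstm(embedded)     # h_n: (1, batch, lstm_hidden)
        text_feat = h_n[-1]                   # (batch, lstm_hidden) -- the "customer-embedding"
        combined = torch.cat([text_feat, tabular.float()], dim=1)
        return torch.sigmoid(self.mlp(combined)).squeeze(1)  # probability of purchase

The idea is that the final LSTM hidden state would act as the learned customer-embedding, which then gets concatenated with the tabular features.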
I found one specific tutorial that does this, except it uses PyTorch Lightning, which is something I want to avoid (https://drivendata.co/blog/hateful-memes-benchmark).
I haven't written any working code for the actual model yet. My current code for the dataset looks something like the below, but I feel that I'm going about it all wrong, especially in terms of the text data:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
class Vocabulary:
    """
    Builds and stores the token <-> index mappings for the application-log "sentences".
    """

    def __init__(self, freq_threshold, max_size):
        """
        freq_threshold : the minimum number of times a word must occur in the corpus to be kept in the vocab
        max_size : maximum vocab size
        """
        # index-to-token dict, initialised with the special tokens
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        # token-to-index dict (the inverse mapping)
        self.stoi = {k: j for j, k in self.itos.items()}
        self.freq_threshold = freq_threshold
        self.max_size = max_size

    def __len__(self):
        # used by the dataloader later when creating batches
        return len(self.itos)

    @staticmethod
    def tokenizer(text):
        # simple tokenizer: split on spaces and lowercase each token
        return [tok.lower().strip() for tok in text.split(" ")]

    def build_vocabulary(self, sentence_list):
        """
        build the vocab: create a dictionary mapping of index to string (itos) and string to index (stoi)
        output ex. for stoi -> {'the': 5, 'a': 6, 'an': 7}
        """
        frequencies = {}  # word -> number of occurrences in the corpus
        idx = 4  # first free index; 0-3 are already used for pad, start, end, unk

        # count the frequency of every word
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1

        # drop words that occur fewer than freq_threshold times
        frequencies = {k: v for k, v in frequencies.items() if v >= self.freq_threshold}

        # limit the vocab to max_size, keeping the most frequent words
        if len(frequencies) > self.max_size - idx:
            frequencies = dict(
                sorted(frequencies.items(), key=lambda x: -x[1])[: self.max_size - idx]
            )

        # assign an index to every remaining word
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

    def numericalize(self, text):
        """
        convert a sentence (string) to a list of vocab indexes; out-of-vocab words map to <UNK>
        """
        tokenized_text = self.tokenizer(text)
        return [self.stoi.get(token, self.stoi["<UNK>"]) for token in tokenized_text]
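Just to sanity-check my understanding of that class, I'd expect it to behave roughly like this (my own toy example):

voc = Vocabulary(freq_threshold=1, max_size=100)
voc.build_vocabulary(["web_pdp__click_main_banner web_pdp__click_prod"])
voc.numericalize("web_pdp__click_prod web_pdp__click_sub")
# -> [5, 3]  (a known token gets its index, an unseen token falls back to <UNK> = 3)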
class MyDataset(Dataset):
    def __init__(self, df, target, text):
        df = df.reset_index(drop=True)
        # tabular features: everything except the target and the text column
        # (note that this currently keeps customer_id in as a numeric feature)
        x = df.drop([target, text], axis=1).values.astype(int)
        self.x_text = df[text].tolist()
        y = df[target].values.astype(int)
        # build the vocabulary from the raw log strings
        self.x_text_voc = Vocabulary(1, 100)
        self.x_text_voc.build_vocabulary(self.x_text)
        self.x_train = torch.tensor(x, dtype=torch.int64)
        self.y_train = torch.tensor(y, dtype=torch.int64)

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, idx):
        # numericalize the log string on the fly, wrapped in <SOS>/<EOS>
        num_source = [self.x_text_voc.stoi["<SOS>"]]
        num_source += self.x_text_voc.numericalize(self.x_text[idx])
        num_source.append(self.x_text_voc.stoi["<EOS>"])
        return self.x_train[idx], torch.tensor(num_source), self.y_train[idx]
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    # __call__ is what the DataLoader invokes on each batch:
    # the object is created with MyCollate(pad_idx) and then called as obj(batch)
    def __call__(self, batch):
        # each item is (tabular features, numericalized log, label)
        tabular = torch.stack([item[0] for item in batch])
        # pad the variable-length log sequences to the longest one in the batch;
        # batch_first=True gives shape (batch, seq_len), matching an LSTM with batch_first=True
        text = pad_sequence(
            [item[1] for item in batch], batch_first=True, padding_value=self.pad_idx
        )
        labels = torch.stack([item[2] for item in batch])
        return tabular, text, labels
def get_train_loader(
    dataset, batch_size, num_workers=0, shuffle=True, pin_memory=False
):
    # the collate fn needs the padding index from the dataset's vocabulary
    pad_idx = dataset.x_text_voc.stoi["<PAD>"]
    # define the loader
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx),
    )
    return loader
train = MyDataset(data, "targets", "application_log")
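For completeness, this is roughly how I would expect to wire the dataset, the loader and the model sketch from above together (untested; batch size, learning rate and number of epochs are arbitrary placeholders):

train_loader = get_train_loader(train, batch_size=4)
model = CustomerModel(
    vocab_size=len(train.x_text_voc),
    n_tabular=train.x_train.shape[1],
    pad_idx=train.x_text_voc.stoi["<PAD>"],
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()
for epoch in range(10):
    for tabular, text, labels in train_loader:
        optimizer.zero_grad()
        probs = model(tabular, text)           # (batch,) predicted purchase probabilities
        loss = criterion(probs, labels.float())
        loss.backward()
        optimizer.step()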