0

I'm building a basic content-based recommender system. I'm expected to build user profiles for N users who have supposedly bought N items, and then use Naive Bayes on the user profile to figure out which features to keep. I'm having a hard time understanding the process exactly.

I tried the following:

`# %%
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer 
# %%
# Download the necessary NLTK resources
# 'stopwords' backs the stopwords.words('english') call further down.
# NOTE(review): 'punkt' is not used by the code visible here (tokenization is
# done with str.split) — confirm whether this download is still needed.
nltk.download('stopwords')
nltk.download('punkt')
# %%
# Load the product dataset; the path is relative to the working directory.
df = pd.read_csv('marketing_sample_for_amazon_com-ecommerce__20200101_20200131__10k_data.csv')
# %%
# Quick look at the first rows of the 'Category' column and of the whole frame.
# NOTE(review): in a notebook-style cell presumably only the last expression
# is displayed, so the first head() call may show nothing — confirm.
df['Category'].head()
df.head()
# %%
# Preprocessing helper: punctuation removal, tokenization, stop-word removal
# and stemming.
import re

stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')
# Frozen copy used for fast membership tests; `stop_words` itself stays a list.
_stop_word_set = frozenset(stop_words)
# Matches every character that is neither a word character nor whitespace.
_punctuation_re = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Return a cleaned, stemmed, space-joined version of *text*.

    Steps: lowercase, replace punctuation with spaces, split on whitespace,
    drop English stop words, stem each remaining token.

    Args:
        text: A cell value. Missing values (as detected by ``pd.isna``) yield
            an empty string; any other value is converted with ``str()``.

    Returns:
        The processed tokens joined by single spaces ('' when nothing is left).
    """
    if pd.isna(text):
        return ''
    # str.replace() treats its first argument as a literal substring, so a
    # regex pattern passed to it never matches; re.sub is needed to actually
    # strip punctuation.
    text = _punctuation_re.sub(' ', str(text).lower())
    # Tokenization
    tokens = text.split()
    # Remove stop words, then stem what is left
    tokens = [stemmer.stem(token) for token in tokens if token not in _stop_word_set]
    # Join the tokens back into a single string
    return ' '.join(tokens)
# %%
# Run the preprocessing described above over every free-text feature column
# present in the dataset.
for column in ('About Product', 'Product Specification', 'Technical Details'):
    df[column] = df[column].apply(preprocess_text)
# %%
# Merge all text features into a single space-separated column.
# (Author's note: not sure whether this is the right approach; asked the
# instructor and waiting for an answer.)
df['general-description'] = df['About Product'].str.cat(
    [df['Product Specification'], df['Technical Details']],
    sep=' ',
)
# %%
# Set up the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Build the item profile as a TF-IDF matrix (one row per product)
item_profile = vectorizer.fit_transform(df['general-description'])

print(item_profile.shape)
# %%
# Generate user profile with ratings
# Sample five catalogue items to stand in for one user's purchase history.
# The sampled rows already carry the cleaned text columns and the combined
# 'general-description' column built earlier in this script, so they are NOT
# passed through preprocess_text a second time: stemming already-stemmed
# tokens can change them and break the match with the fitted TF-IDF vocabulary.
user_profile_items = df.sample(n=5, random_state=1).copy()
# %%
# Add user ratings column (one hand-picked rating per sampled item)
user_ratings = [4, 5, 3, 2, 4]
user_profile_items['User Rating'] = user_ratings

# Prepare data for classification: the user's rated items are the training
# set, expressed in the same TF-IDF space as the item profile.
X_user = vectorizer.transform(user_profile_items['general-description'])
y_user = user_profile_items['User Rating']

# Train Naive Bayes classifier on THIS user's ratings. Fitting on the
# catalogue's 'Star Rating' column instead would ignore the user profile
# entirely and leave y_user unused.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_user, y_user)

# Predict the rating this user would give to every item in the catalogue
y_pred = clf.predict(item_profile)
print('Predicted user ratings:', y_pred)

`So this is it. The problem is that I'm only modelling one user, and I don't know how many users I need to input.

mahdy
  • 1

0 Answers0