import re
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared text-normalization resources used by preprocess_text().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# NOTE(review): the original source was garbled here — it contained the bare
# fragment "subtitle Evil Dead 2013 Blu ray 1080p Dual Audi...", which reads
# like a mangled assignment of the raw subtitle/title text that the script
# later consumes as `subtitle_text`. Reconstructed as that assignment —
# TODO confirm the full original string.
subtitle_text = "Evil Dead 2013 Blu ray 1080p Dual Audi..."
def preprocess_text(text):
    """Normalize raw subtitle text for TF-IDF vectorization.

    Lowercases and tokenizes *text*, strips every non-alphabetic character
    from each token, drops tokens that become empty, removes English
    stopwords, lemmatizes the survivors, and returns them joined by single
    spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Space-joined cleaned tokens.
    """
    tokens = word_tokenize(text.lower())
    # Keep only letters in each token (digits and punctuation removed).
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]
    # Stripping can leave empty strings (e.g. "1080p" -> "p" but "2013" -> "");
    # drop them before the stopword/lemmatization pass.
    tokens = [token for token in tokens if token]
    # Stopword check runs on the raw (pre-lemmatization) token, matching the
    # original behavior.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return " ".join(tokens)
# Preprocess first, then fit TF-IDF on the cleaned document.
# BUG FIX: the original computed fit_transform([preprocessed_text]) BEFORE
# `preprocessed_text` was assigned, which raises NameError at runtime — the
# two statements are reordered here.
preprocessed_text = preprocess_text(subtitle_text)

# TF-IDF over a single-document corpus: with only one document every term's
# IDF is identical, so the resulting weights reflect term frequency only.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform([preprocessed_text])

# Features: dense TF-IDF weight vector for the (single) document.
features = tfidf.toarray()[0]