Skip to content

NLP Intro

Autor: Laura Moldovan, AIIS x Nitro AI Workshops 2025

Slide-uri: NLP Slides

# Toy corpus used by every preprocessing step below.
data = [
    "Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!",
    "Natural Language Processing, or NLP, enables computers to understand human language.",
    "Text preprocessing is a crucial step in NLP pipelines.",
]

# Show the untouched sentences, one bullet per line.
print("Original Data:")
for sentence in data:
    print("- " + sentence)
Original Data:
- Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!
- Natural Language Processing, or NLP, enables computers to understand human language.
- Text preprocessing is a crucial step in NLP pipelines.

Lowercase

# Fold every sentence to lowercase so "NLP" and "nlp" become the same string.
data_lower = list(map(str.lower, data))
print("\nLowercase Data:")
for lowered in data_lower:
    print(f"- {lowered}")
Lowercase Data:
- welcome to nitro ai's workshop, hosted by aiis conference on natural language processing!
- natural language processing, or nlp, enables computers to understand human language.
- text preprocessing is a crucial step in nlp pipelines.

Tokenization

import nltk
from nltk.tokenize import word_tokenize
import numpy as np

# Fetch the tokenizer data package used by word_tokenize.
nltk.download('punkt_tab')

# One list of tokens per lowercased sentence.
data_tokens = []
for sentence in data_lower:
    data_tokens.append(word_tokenize(sentence))

print("\nTokenized Data:")
for token_list in data_tokens:
    print(f"- {token_list}")
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Tokenized Data:
- ['welcome', 'to', 'nitro', 'ai', "'s", 'workshop', ',', 'hosted', 'by', 'aiis', 'conference', 'on', 'natural', 'language', 'processing', '!']
- ['natural', 'language', 'processing', ',', 'or', 'nlp', ',', 'enables', 'computers', 'to', 'understand', 'human', 'language', '.']
- ['text', 'preprocessing', 'is', 'a', 'crucial', 'step', 'in', 'nlp', 'pipelines', '.']

Eliminarea punctuatiei

# Keep only tokens made entirely of letters/digits; str.isalnum() is False
# for tokens such as ',' or "'s", so those are dropped.
data_no_punctuation = [list(filter(str.isalnum, tokens)) for tokens in data_tokens]
print("\nData without Punctuation:")
for kept_tokens in data_no_punctuation:
    print(f"- {kept_tokens}")
Data without Punctuation:
- ['welcome', 'to', 'nitro', 'ai', 'workshop', 'hosted', 'by', 'aiis', 'conference', 'on', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'or', 'nlp', 'enables', 'computers', 'to', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'is', 'a', 'crucial', 'step', 'in', 'nlp', 'pipelines']

Remove stopwords

from nltk.corpus import stopwords

# Download the stopword corpus and keep the English list as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Drop every token that appears in the stopword set.
data_no_stopwords = []
for tokens in data_no_punctuation:
    data_no_stopwords.append([word for word in tokens if word not in stop_words])

print("\nData without Stopwords:")
for kept_tokens in data_no_stopwords:
    print(f"- {kept_tokens}")
Data without Stopwords:
- ['welcome', 'nitro', 'ai', 'workshop', 'hosted', 'aiis', 'conference', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'nlp', 'enables', 'computers', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'crucial', 'step', 'nlp', 'pipelines']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Stemming

from nltk.stem import PorterStemmer

# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
data_stemmed = [list(map(stemmer.stem, tokens)) for tokens in data_no_stopwords]
print("\nStemmed Data:")
for stems in data_stemmed:
    print(f"- {stems}")
Stemmed Data:
- ['welcom', 'nitro', 'ai', 'workshop', 'host', 'aii', 'confer', 'natur', 'languag', 'process']
- ['natur', 'languag', 'process', 'nlp', 'enabl', 'comput', 'understand', 'human', 'languag']
- ['text', 'preprocess', 'crucial', 'step', 'nlp', 'pipelin']

Lemmatization

from nltk.stem import WordNetLemmatizer

# WordNet data is needed by the lemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# lemmatize() is called without a part-of-speech tag, so plural nouns are
# reduced ('computers' -> 'computer') while verb forms such as 'hosted'
# are left as they are.
data_lemmatized = [list(map(lemmatizer.lemmatize, tokens)) for tokens in data_no_stopwords]

print("\nLemmatized Data:")
for lemmas in data_lemmatized:
    print(f"- {lemmas}")
[nltk_data] Downloading package wordnet to /root/nltk_data...



Lemmatized Data:
- ['welcome', 'nitro', 'ai', 'workshop', 'hosted', 'aiis', 'conference', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'nlp', 'enables', 'computer', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'crucial', 'step', 'nlp', 'pipeline']
# Glue each lemma list back into a single space-separated string.
data_cleaned = []
for lemmas in data_lemmatized:
    data_cleaned.append(" ".join(lemmas))

print("\n Cleaned Data:")
for cleaned in data_cleaned:
    print(f"- {cleaned}")
 Cleaned Data:
- welcome nitro ai workshop hosted aiis conference natural language processing
- natural language processing nlp enables computer understand human language
- text preprocessing crucial step nlp pipeline
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Alphabetically sorted list of every distinct word in the cleaned sentences.
vocab = sorted({word for sentence in data_cleaned for word in sentence.split()})

# Fit the encoder on a single column holding one vocabulary word per row;
# sparse_output=False makes fit_transform return a dense array.
encoder = OneHotEncoder(sparse_output=False)
encoded_vocab = encoder.fit_transform(np.array(vocab).reshape(-1, 1))

# Row i of encoded_vocab is the one-hot vector of vocab[i].
vocab_to_onehot = {word: encoded_vocab[i] for i, word in enumerate(vocab)}

# Encode each sentence as a list of one-hot vectors. Membership is tested on
# the dict (hash lookup) rather than on the vocab list (linear scan), and on
# the same object that is indexed right after.
sentence_encodings = [
    [vocab_to_onehot[word] for word in sentence.split() if word in vocab_to_onehot]
    for sentence in data_cleaned
]

print("Vocabular: ", vocab)
print("\nPropozitie originala:", data_cleaned[0])
print("\nEncoding:")
for word, encoding in zip(data_cleaned[0].split(), sentence_encodings[0]):
    print(f"{word}: {encoding}")
Vocabular:  ['ai', 'aiis', 'computer', 'conference', 'crucial', 'enables', 'hosted', 'human', 'language', 'natural', 'nitro', 'nlp', 'pipeline', 'preprocessing', 'processing', 'step', 'text', 'understand', 'welcome', 'workshop']

Propozitie originala: welcome nitro ai workshop hosted aiis conference natural language processing

Encoding:
welcome: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
nitro: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
ai: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
workshop: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
hosted: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
aiis: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
conference: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
natural: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
processing: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
# Print the vocabulary and the per-word one-hot vectors of the sentence at index 1.
print("Vocabular: ", vocab)
second_sentence = data_cleaned[1]
print("\nPropozitie originala:", second_sentence)
print("\nEncoding:")
for token, onehot in zip(second_sentence.split(), sentence_encodings[1]):
    print(f"{token}: {onehot}")
Vocabular:  ['ai', 'aiis', 'computer', 'conference', 'crucial', 'enables', 'hosted', 'human', 'language', 'natural', 'nitro', 'nlp', 'pipeline', 'preprocessing', 'processing', 'step', 'text', 'understand', 'welcome', 'workshop']

Propozitie originala: natural language processing nlp enables computer understand human language

Encoding:
natural: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
processing: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
nlp: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
enables: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
computer: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
understand: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
human: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: one row per sentence, one column per vocabulary word,
# each cell holding how many times the word occurs in the sentence.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(data_cleaned)

bow_vocab = vectorizer.get_feature_names_out()
print("Vocab:", bow_vocab)
print("Bow Matrix:")
# fit_transform returns a sparse matrix; densify it only for display.
bow_dense = bow_matrix.toarray()
print(bow_dense)
Vocab: ['ai' 'aiis' 'computer' 'conference' 'crucial' 'enables' 'hosted' 'human'
 'language' 'natural' 'nitro' 'nlp' 'pipeline' 'preprocessing'
 'processing' 'step' 'text' 'understand' 'welcome' 'workshop']
Bow Matrix:
[[1 1 0 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 1 1]
 [0 0 1 0 0 1 0 1 2 1 0 1 0 0 1 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0]]
import pandas as pd

# Same counts as a DataFrame, with the vocabulary words as column labels.
bow_columns = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_columns)
print(bow_df)
   ai  aiis  computer  conference  crucial  ...  step  text  understand  welcome  workshop
0   1     1         0           1        0  ...     0     0           0        1         1
1   0     0         1           0        0  ...     0     0           1        0         0
2   0     0         0           0        1  ...     1     1           0        0         0

[3 rows x 20 columns]
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the same cleaned sentences.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data_cleaned)

tfidf_vocab = tfidf_vectorizer.get_feature_names_out()
print("Vocab:", tfidf_vocab)
# Densify the sparse result only for display.
print(tfidf_matrix.toarray())
Vocab: ['ai' 'aiis' 'computer' 'conference' 'crucial' 'enables' 'hosted' 'human'
 'language' 'natural' 'nitro' 'nlp' 'pipeline' 'preprocessing'
 'processing' 'step' 'text' 'understand' 'welcome' 'workshop']
[[0.338348   0.338348   0.         0.338348   0.         0.
  0.338348   0.         0.25732238 0.25732238 0.338348   0.
  0.         0.         0.25732238 0.         0.         0.
  0.338348   0.338348  ]
 [0.         0.         0.35248004 0.         0.         0.35248004
  0.         0.35248004 0.53614032 0.26807016 0.         0.26807016
  0.         0.         0.26807016 0.         0.         0.35248004
  0.         0.        ]
 [0.         0.         0.         0.         0.42339448 0.
  0.         0.         0.         0.         0.         0.32200242
  0.42339448 0.42339448 0.         0.42339448 0.42339448 0.
  0.         0.        ]]
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF rows: entry [i, j] compares
# sentence i with sentence j.
similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix)
[[1.         0.2759218  0.        ]
 [0.2759218  1.         0.08631924]
 [0.         0.08631924 1.        ]]
from textblob import TextBlob

# Score the original (unprocessed) sentences with TextBlob.
# The index from enumerate() was never used, so iterate over the sentences directly.
for text in data:
    sentiment = TextBlob(text).sentiment
    print(f"Sentiment for '{text}': Polarity={sentiment.polarity}, Subjectivity={sentiment.subjectivity}")
Sentiment for 'Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!': Polarity=0.4625, Subjectivity=0.65
Sentiment for 'Natural Language Processing, or NLP, enables computers to understand human language.': Polarity=0.05, Subjectivity=0.25
Sentiment for 'Text preprocessing is a crucial step in NLP pipelines.': Polarity=0.0, Subjectivity=1.0

NLP Sentiment Classification

Autor: Laura Moldovan, AIIS x Nitro AI Workshops 2025

! kaggle datasets download -d kazanova/sentiment140
Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:02<00:00, 41.7MB/s]
100% 80.9M/80.9M [00:02<00:00, 32.0MB/s]
from zipfile import ZipFile

# Archive downloaded by the kaggle CLI command above.
dataset = 'sentiment140.zip'

# Unpack everything into the current working directory; the context manager
# closes the archive afterwards. 'r' is ZipFile's default mode.
with ZipFile(dataset) as archive:
    archive.extractall()
    print('Extracted all files from', dataset)
Extracted all files from sentiment140.zip
import pandas as pd
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # and, the, a
from nltk.stem.porter import PorterStemmer # running - > run

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
# First attempt: load the CSV with pandas' default header handling.
# The file has no header row, so the first tweet is consumed as the column
# names and the frame ends up one row short (1,599,999 rows).
raw_csv_path = 'training.1600000.processed.noemoticon.csv'
twitter_sentiment_df = pd.read_csv(raw_csv_path, encoding='ISO-8859-1')

row_count, column_count = twitter_sentiment_df.shape
print('Number of rows', row_count)
print('Number of columns', column_count)

twitter_sentiment_df.head()
Number of rows 1599999
Number of columns 6
0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton is upset that he can't update his Facebook by ...
1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus @Kenichan I dived many times for the ball. Man...
2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF my whole body feels itchy and like its on fire
3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli @nationwideclass no, it's not behaving at all....
4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf @Kwesidei not the whole crew
# Explicit column names: with `names=` given, pandas treats the first line
# of the file as data, so all 1,600,000 tweets are kept.
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

twitter_sentiment_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=columns,
)

row_count, column_count = twitter_sentiment_df.shape
print('Number of rows', row_count)
print('Number of columns', column_count)

twitter_sentiment_df.head()
Number of rows 1600000
Number of columns 6
target id date flag user text
0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t...
1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton is upset that he can't update his Facebook by ...
2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus @Kenichan I dived many times for the ball. Man...
3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF my whole body feels itchy and like its on fire
4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli @nationwideclass no, it's not behaving at all....
# Relabel the target value 4 as 1 so the two classes are 0 and 1.
twitter_sentiment_df['target'] = twitter_sentiment_df['target'].replace({4: 1})
twitter_sentiment_df.head()
target id date flag user text
0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t...
1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton is upset that he can't update his Facebook by ...
2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus @Kenichan I dived many times for the ball. Man...
3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF my whole body feels itchy and like its on fire
4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli @nationwideclass no, it's not behaving at all....
# Total number of missing cells across the whole frame.
missing_cells = twitter_sentiment_df.isnull().sum().sum()
print('No null values' if missing_cells == 0 else 'Null values')
No null values
# Count the two classes with vectorised comparisons instead of a Python-level
# loop over every row of the target column.
targets = twitter_sentiment_df['target']
negative_count = int((targets == 0).sum())
positive_count = int((targets == 1).sum())

# Any row that is neither 0 nor 1 is reported once, as the loop version did.
unknown_count = len(targets) - negative_count - positive_count
for _ in range(unknown_count):
    print("Valoare necunoscuta")

print("Number of positive tweets:", positive_count)
print("Number of negative tweets:", negative_count)
Number of positive tweets: 800000
Number of negative tweets: 800000
english_stopwords = set(stopwords.words('english'))

def stem_text(text):
    """Return *text* cleaned, stopword-filtered and Porter-stemmed.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and split on whitespace, tokens found in the
    module-level ``english_stopwords`` set are dropped, and the remaining
    tokens are stemmed and joined back with single spaces.

    Args:
        text: The raw string to process.

    Returns:
        The space-separated stemmed tokens; an empty string when no token
        survives the filtering.
    """
    letters_only = re.sub(r'[^A-Za-z]', ' ', text).lower()

    # Build the stemmer once per call rather than once per token.
    stemmer = PorterStemmer()

    stemmed_tokens = [
        stemmer.stem(token)
        for token in letters_only.split()
        if token not in english_stopwords
    ]
    return ' '.join(stemmed_tokens)
# Run stem_text over every tweet and store the result in a new column.
stemmed_tweets = twitter_sentiment_df['text'].apply(stem_text)
twitter_sentiment_df['stemmed_tweet'] = stemmed_tweets

# Show the raw and processed text side by side for the first rows.
twitter_sentiment_df[['text', 'stemmed_tweet']].head()
text stemmed_tweet
0 @switchfoot http://twitpic.com/2y1zl - Awww, t... switchfoot http twitpic com zl awww bummer sho...
1 is upset that he can't update his Facebook by ... upset updat facebook text might cri result sch...
2 @Kenichan I dived many times for the ball. Man... kenichan dive mani time ball manag save rest g...
3 my whole body feels itchy and like its on fire whole bodi feel itchi like fire
4 @nationwideclass no, it's not behaving at all.... nationwideclass behav mad see
# Features are the stemmed tweets, labels are the 0/1 targets.
X = twitter_sentiment_df['stemmed_tweet'].values
Y = twitter_sentiment_df['target'].values

# 80/20 split, stratified on the labels, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

# The vectorizer is fitted on the training split only; the test split is
# transformed with the vocabulary and weights learned from training data.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Logistic regression classifier with the iteration cap raised to 1000.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)
LogisticRegression(max_iter=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Accuracy on the data the model was fitted on.
X_train_predictions = model.predict(X_train)
training_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_predictions)
print('Accuracy score on training data: {:.2f}%'.format(100 * training_accuracy))
Accuracy score on training data: 79.87%
# Accuracy on the held-out test split.
X_test_predictions = model.predict(X_test)
testing_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_predictions)
print('Accuracy score on testing data: {:.2f}%'.format(100 * testing_accuracy))
Accuracy score on testing data: 77.67%

Incheiere

Sper ca v-a placut acest tutorial. Pe paginile noastre o sa gasiti in continuare informatii despre viitoare cursuri, competitii si evenimente.