NLP Intro

Autor: Laura Moldovan, AIIS x Nitro AI Workshops 2025

# Toy corpus: three raw sentences that every later preprocessing step
# transforms in turn.
data = [
    "Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!",
    "Natural Language Processing, or NLP, enables computers to understand human language.",
    "Text preprocessing is a crucial step in NLP pipelines.",
]

# Show the untouched input, one bullet per sentence.
print("Original Data:")
print("\n".join(f"- {sentence}" for sentence in data))

Original Data:
- Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!
- Natural Language Processing, or NLP, enables computers to understand human language.
- Text preprocessing is a crucial step in NLP pipelines.

Lowercase

# Lowercase every sentence so that e.g. "NLP" and "nlp" become the same text.
data_lower = list(map(str.lower, data))

print("\nLowercase Data:")
for sentence in data_lower:
    print(f"- {sentence}")

Lowercase Data:
- welcome to nitro ai's workshop, hosted by aiis conference on natural language processing!
- natural language processing, or nlp, enables computers to understand human language.
- text preprocessing is a crucial step in nlp pipelines.

Tokenization

import nltk
from nltk.tokenize import word_tokenize
import numpy as np

# Fetch the 'punkt_tab' NLTK resource before tokenizing.
nltk.download('punkt_tab')

# Tokenize each lowercased sentence; punctuation comes out as separate tokens.
data_tokens = []
for sentence in data_lower:
    data_tokens.append(word_tokenize(sentence))

print("\nTokenized Data:")
for token_list in data_tokens:
    print(f"- {token_list}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Tokenized Data:
- ['welcome', 'to', 'nitro', 'ai', "'s", 'workshop', ',', 'hosted', 'by', 'aiis', 'conference', 'on', 'natural', 'language', 'processing', '!']
- ['natural', 'language', 'processing', ',', 'or', 'nlp', ',', 'enables', 'computers', 'to', 'understand', 'human', 'language', '.']
- ['text', 'preprocessing', 'is', 'a', 'crucial', 'step', 'in', 'nlp', 'pipelines', '.']

Eliminarea punctuatiei

# Keep only purely alphanumeric tokens. Besides punctuation marks this also
# drops mixed tokens such as "'s".
data_no_punctuation = [
    list(filter(str.isalnum, token_list)) for token_list in data_tokens
]

print("\nData without Punctuation:")
for token_list in data_no_punctuation:
    print(f"- {token_list}")

Data without Punctuation:
- ['welcome', 'to', 'nitro', 'ai', 'workshop', 'hosted', 'by', 'aiis', 'conference', 'on', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'or', 'nlp', 'enables', 'computers', 'to', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'is', 'a', 'crucial', 'step', 'in', 'nlp', 'pipelines']

Remove stopwords

from nltk.corpus import stopwords

# Load the English stopword list into a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Filter the stopwords out of every punctuation-free token list.
data_no_stopwords = []
for token_list in data_no_punctuation:
    kept_tokens = [token for token in token_list if token not in stop_words]
    data_no_stopwords.append(kept_tokens)

print("\nData without Stopwords:")
for token_list in data_no_stopwords:
    print(f"- {token_list}")

Data without Stopwords:
- ['welcome', 'nitro', 'ai', 'workshop', 'hosted', 'aiis', 'conference', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'nlp', 'enables', 'computers', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'crucial', 'step', 'nlp', 'pipelines']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Stemming

from nltk.stem import PorterStemmer

# One shared Porter stemmer reduces each token to its stem
# (e.g. "language" -> "languag"); stems need not be dictionary words.
stemmer = PorterStemmer()
data_stemmed = [
    list(map(stemmer.stem, token_list)) for token_list in data_no_stopwords
]

print("\nStemmed Data:")
for token_list in data_stemmed:
    print(f"- {token_list}")

Stemmed Data:
- ['welcom', 'nitro', 'ai', 'workshop', 'host', 'aii', 'confer', 'natur', 'languag', 'process']
- ['natur', 'languag', 'process', 'nlp', 'enabl', 'comput', 'understand', 'human', 'languag']
- ['text', 'preprocess', 'crucial', 'step', 'nlp', 'pipelin']

Lemmatization

from nltk.stem import WordNetLemmatizer

# Lemmatize the stopword-free tokens (not the stems) using WordNet,
# e.g. "computers" -> "computer".
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

data_lemmatized = []
for token_list in data_no_stopwords:
    data_lemmatized.append([lemmatizer.lemmatize(token) for token in token_list])

print("\nLemmatized Data:")
for token_list in data_lemmatized:
    print(f"- {token_list}")

[nltk_data] Downloading package wordnet to /root/nltk_data...



Lemmatized Data:
- ['welcome', 'nitro', 'ai', 'workshop', 'hosted', 'aiis', 'conference', 'natural', 'language', 'processing']
- ['natural', 'language', 'processing', 'nlp', 'enables', 'computer', 'understand', 'human', 'language']
- ['text', 'preprocessing', 'crucial', 'step', 'nlp', 'pipeline']

# Re-assemble each lemmatized token list into a single space-separated string;
# the vectorizers below consume plain strings.
data_cleaned = list(map(" ".join, data_lemmatized))

print("\n Cleaned Data:")
for sentence in data_cleaned:
    print(f"- {sentence}")

 Cleaned Data:
- welcome nitro ai workshop hosted aiis conference natural language processing
- natural language processing nlp enables computer understand human language
- text preprocessing crucial step nlp pipeline

# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Vocabulary: every distinct word of the cleaned corpus, in sorted order.
vocab = sorted({word for sentence in data_cleaned for word in sentence.split()})

# Fit the encoder on the vocabulary as a single-column array; row i of the
# dense result is the one-hot vector of vocab[i].
encoder = OneHotEncoder(sparse_output=False)
encoded_vocab = encoder.fit_transform(np.array(vocab).reshape(-1, 1))

# Pair each word with its row directly instead of indexing by position.
vocab_to_onehot = dict(zip(vocab, encoded_vocab))

# Encode every sentence as the list of its words' one-hot vectors.
# Membership is tested against the dict (O(1)) rather than the vocab list
# (O(len(vocab)) per word); unknown words are still skipped.
sentence_encodings = [
    [vocab_to_onehot[word] for word in sentence.split() if word in vocab_to_onehot]
    for sentence in data_cleaned
]

print("Vocabular: ", vocab)
print("\nPropozitie originala:", data_cleaned[0])
print("\nEncoding:")
for word, encoding in zip(data_cleaned[0].split(), sentence_encodings[0]):
    print(f"{word}: {encoding}")

Vocabular:  ['ai', 'aiis', 'computer', 'conference', 'crucial', 'enables', 'hosted', 'human', 'language', 'natural', 'nitro', 'nlp', 'pipeline', 'preprocessing', 'processing', 'step', 'text', 'understand', 'welcome', 'workshop']

Propozitie originala: welcome nitro ai workshop hosted aiis conference natural language processing

Encoding:
welcome: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
nitro: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
ai: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
workshop: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
hosted: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
aiis: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
conference: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
natural: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
processing: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]

# Same report as above, this time for the second cleaned sentence (index 1).
print("Vocabular: ", vocab)
print("\nPropozitie originala:", data_cleaned[1])
print("\nEncoding:")
second_sentence_words = data_cleaned[1].split()
for token, one_hot in zip(second_sentence_words, sentence_encodings[1]):
    print(f"{token}: {one_hot}")

Vocabular:  ['ai', 'aiis', 'computer', 'conference', 'crucial', 'enables', 'hosted', 'human', 'language', 'natural', 'nitro', 'nlp', 'pipeline', 'preprocessing', 'processing', 'step', 'text', 'understand', 'welcome', 'workshop']

Propozitie originala: natural language processing nlp enables computer understand human language

Encoding:
natural: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
processing: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
nlp: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
enables: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
computer: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
understand: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
human: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
language: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: learn the vocabulary from the cleaned sentences and count
# how often each vocabulary word occurs in each sentence.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(data_cleaned)

print("Vocab:", vectorizer.get_feature_names_out())
print("Bow Matrix:")
# The result is sparse; convert it to a dense array for display.
bow_dense = bow_matrix.toarray()
print(bow_dense)

Vocab: ['ai' 'aiis' 'computer' 'conference' 'crucial' 'enables' 'hosted' 'human'
 'language' 'natural' 'nitro' 'nlp' 'pipeline' 'preprocessing'
 'processing' 'step' 'text' 'understand' 'welcome' 'workshop']
Bow Matrix:
[[1 1 0 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 1 1]
 [0 0 1 0 0 1 0 1 2 1 0 1 0 0 1 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0]]

import pandas as pd

# Tabular view of the count matrix: one row per sentence, one column per word.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print(bow_df)

   ai  aiis  computer  conference  crucial  ...  step  text  understand  welcome  workshop
0   1     1         0           1        0  ...     0     0           0        1         1
1   0     0         1           0        0  ...     0     0           1        0         0
2   0     0         0           0        1  ...     1     1           0        0         0

[3 rows x 20 columns]

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: same vocabulary as the count matrix, but each entry is a weight
# rather than a raw count.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data_cleaned)

print("Vocab:", tfidf_vectorizer.get_feature_names_out())
tfidf_dense = tfidf_matrix.toarray()
print(tfidf_dense)

Vocab: ['ai' 'aiis' 'computer' 'conference' 'crucial' 'enables' 'hosted' 'human'
 'language' 'natural' 'nitro' 'nlp' 'pipeline' 'preprocessing'
 'processing' 'step' 'text' 'understand' 'welcome' 'workshop']
[[0.338348   0.338348   0.         0.338348   0.         0.
  0.338348   0.         0.25732238 0.25732238 0.338348   0.
  0.         0.         0.25732238 0.         0.         0.
  0.338348   0.338348  ]
 [0.         0.         0.35248004 0.         0.         0.35248004
  0.         0.35248004 0.53614032 0.26807016 0.         0.26807016
  0.         0.         0.26807016 0.         0.         0.35248004
  0.         0.        ]
 [0.         0.         0.         0.         0.42339448 0.
  0.         0.         0.         0.         0.         0.32200242
  0.42339448 0.42339448 0.         0.42339448 0.42339448 0.
  0.         0.        ]]

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF vectors of the sentences;
# entry [i, j] compares sentence i with sentence j.
similarity_matrix = cosine_similarity(X=tfidf_matrix)
print(similarity_matrix)

[[1.         0.2759218  0.        ]
 [0.2759218  1.         0.08631924]
 [0.         0.08631924 1.        ]]

from textblob import TextBlob

# Score the ORIGINAL (unprocessed) sentences with TextBlob and report the
# polarity and subjectivity of each one.
# The loop index was never used, so iterate over the sentences directly.
for text in data:
    sentiment = TextBlob(text).sentiment
    print(f"Sentiment for '{text}': Polarity={sentiment.polarity}, Subjectivity={sentiment.subjectivity}")

Sentiment for 'Welcome to Nitro AI's workshop, hosted by AIIS conference on Natural Language Processing!': Polarity=0.4625, Subjectivity=0.65
Sentiment for 'Natural Language Processing, or NLP, enables computers to understand human language.': Polarity=0.05, Subjectivity=0.25
Sentiment for 'Text preprocessing is a crucial step in NLP pipelines.': Polarity=0.0, Subjectivity=1.0

NLP Sentiment Classification

Autor: Laura Moldovan, AIIS x Nitro AI Workshops 2025

# Shell escape: download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires the Kaggle CLI to be installed and API
# credentials to be configured — confirm for your environment.
! kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:02<00:00, 41.7MB/s]
100% 80.9M/80.9M [00:02<00:00, 32.0MB/s]

from zipfile import ZipFile

# Archive produced by the Kaggle download step.
dataset = 'sentiment140.zip'

# Unpack everything into the current working directory; the context manager
# closes the archive afterwards.
with ZipFile(dataset, mode='r') as archive:
    archive.extractall()
    print('Extracted all files from', dataset)

Extracted all files from sentiment140.zip

import pandas as pd
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # and, the, a
from nltk.stem.porter import PorterStemmer # running - > run

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

# First attempt: load the CSV with default settings. The file has no header
# row, so pandas consumes the first tweet as column names and reports one
# row fewer (1599999) than the file actually holds.
twitter_sentiment_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
)

n_rows, n_cols = twitter_sentiment_df.shape
print('Number of rows', n_rows)
print('Number of columns', n_cols)

twitter_sentiment_df.head()

Number of rows 1599999
Number of columns 6

	1467810369	Mon Apr 06 22:19:45 PDT 2009	NO_QUERY	_TheSpecialOne_	@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
0	1467810672	Mon Apr 06 22:19:49 PDT 2009	NO_QUERY	scotthamilton	is upset that he can't update his Facebook by ...
1	1467810917	Mon Apr 06 22:19:53 PDT 2009	NO_QUERY	mattycus	@Kenichan I dived many times for the ball. Man...
2	1467811184	Mon Apr 06 22:19:57 PDT 2009	NO_QUERY	ElleCTF	my whole body feels itchy and like its on fire
3	1467811193	Mon Apr 06 22:19:57 PDT 2009	NO_QUERY	Karoli	@nationwideclass no, it's not behaving at all....
4	1467811372	Mon Apr 06 22:20:00 PDT 2009	NO_QUERY	joy_wolf	@Kwesidei not the whole crew

# Explicit column names: passing them via `names=` makes pandas treat the
# first line of the file as data, so all 1600000 rows are loaded.
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

twitter_sentiment_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=columns,
)

n_rows, n_cols = twitter_sentiment_df.shape
print('Number of rows', n_rows)
print('Number of columns', n_cols)

twitter_sentiment_df.head()

Number of rows 1600000
Number of columns 6

	id	date	flag	user	text
0	1467810369	Mon Apr 06 22:19:45 PDT 2009	NO_QUERY	_TheSpecialOne_	@switchfoot http://twitpic.com/2y1zl - Awww, t...
1	1467810672	Mon Apr 06 22:19:49 PDT 2009	NO_QUERY	scotthamilton	is upset that he can't update his Facebook by ...
2	1467810917	Mon Apr 06 22:19:53 PDT 2009	NO_QUERY	mattycus	@Kenichan I dived many times for the ball. Man...
3	1467811184	Mon Apr 06 22:19:57 PDT 2009	NO_QUERY	ElleCTF	my whole body feels itchy and like its on fire
4	1467811193	Mon Apr 06 22:19:57 PDT 2009	NO_QUERY	Karoli	@nationwideclass no, it's not behaving at all....

# Recode label 4 as 1 so the target is binary (0 / 1); the counting cell
# below treats 1 as positive and 0 as negative.
remapped_target = twitter_sentiment_df['target'].replace({4: 1})
twitter_sentiment_df['target'] = remapped_target
twitter_sentiment_df.head()

	id	date	flag	user	text
0	1467810369	Mon Apr 06 22:19:45 PDT 2009	NO_QUERY	_TheSpecialOne_	@switchfoot http://twitpic.com/2y1zl - Awww, t...
1	1467810672	Mon Apr 06 22:19:49 PDT 2009	NO_QUERY	scotthamilton	is upset that he can't update his Facebook by ...
2	1467810917	Mon Apr 06 22:19:53 PDT 2009	NO_QUERY	mattycus	@Kenichan I dived many times for the ball. Man...
3	1467811184	Mon Apr 06 22:19:57 PDT 2009	NO_QUERY	ElleCTF	my whole body feels itchy and like its on fire
4	1467811193	Mon Apr 06 22:19:57 PDT 2009	NO_QUERY	Karoli	@nationwideclass no, it's not behaving at all....

# Report whether any cell in the whole DataFrame is missing.
has_nulls = twitter_sentiment_df.isnull().any().any()
print('Null values' if has_nulls else 'No null values')

No null values

# Count the tweets per class with vectorized comparisons instead of a
# Python-level loop over all 1.6M rows.
target = twitter_sentiment_df['target']
negative_count = int((target == 0).sum())
positive_count = int((target == 1).sum())

# Anything that is neither 0 nor 1 (including missing values) is unexpected;
# emit one message per such row.
unknown_count = len(target) - negative_count - positive_count
for _ in range(unknown_count):
    print("Valoare necunoscuta")

print("Number of positive tweets:", positive_count)
print("Number of negative tweets:", negative_count)

Number of positive tweets: 800000
Number of negative tweets: 800000

english_stopwords = set(stopwords.words('english'))

# Built once and reused: constructing a new PorterStemmer for every token
# is pure overhead when the function is applied to 1.6M tweets.
_stemmer = PorterStemmer()
_non_letters = re.compile('[^A-Za-z]')


def stem_text(text):
    """Normalize a tweet into a space-separated string of stemmed tokens.

    Steps: replace every character outside A-Z / a-z with a space, lowercase,
    split on whitespace, drop English stopwords, and Porter-stem the rest.

    Args:
        text: Raw tweet text (str).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    words = _non_letters.sub(' ', text).lower().split()
    return ' '.join(
        _stemmer.stem(word) for word in words if word not in english_stopwords
    )

# Run the normalization over every tweet and store the result in a new column.
stemmed_tweets = twitter_sentiment_df['text'].apply(stem_text)
twitter_sentiment_df['stemmed_tweet'] = stemmed_tweets

# Side-by-side preview of the raw and the stemmed text.
preview_columns = ['text', 'stemmed_tweet']
twitter_sentiment_df[preview_columns].head()

	text	stemmed_tweet
0	@switchfoot http://twitpic.com/2y1zl - Awww, t...	switchfoot http twitpic com zl awww bummer sho...
1	is upset that he can't update his Facebook by ...	upset updat facebook text might cri result sch...
2	@Kenichan I dived many times for the ball. Man...	kenichan dive mani time ball manag save rest g...
3	my whole body feels itchy and like its on fire	whole bodi feel itchi like fire
4	@nationwideclass no, it's not behaving at all....	nationwideclass behav mad see

# Features are the stemmed tweets, labels the binary sentiment target.
X = twitter_sentiment_df['stemmed_tweet'].values
Y = twitter_sentiment_df['target'].values

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

# Fit TF-IDF on the training tweets only, then apply the same fitted
# vocabulary to the test tweets.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train the classifier on the TF-IDF features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

LogisticRegression(max_iter=1000)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# Accuracy on the data the model was trained on.
X_train_predictions = model.predict(X_train)
training_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_predictions,
)

print('Accuracy score on training data: {:.2f}%'.format(training_accuracy * 100))

Accuracy score on training data: 79.87%

# Accuracy on the held-out test split.
X_test_predictions = model.predict(X_test)
testing_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_predictions,
)

print('Accuracy score on testing data: {:.2f}%'.format(testing_accuracy * 100))

Accuracy score on testing data: 77.67%

Incheiere

Sper ca v-a placut acest tutorial. Pe paginile noastre o sa gasiti in continuare informatii despre viitoare cursuri, competitii si evenimente.