<img src='https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQ-VfNtOyJbsaxu43Kztf_cv1mgBG6ZIQZEVw&usqp=CAU'>

# Procesamiento de Lenguage Natural




Datos: Libre elección

Expectativas:
- Pre-procesamiento del texto
- Uso de Word2Vec (Consejo: jugar con los parámetros)
- Mostrar las palabras más parecidas (`most_similar`) de tres palabras que le llamen la atención
- Responder:
    - ¿Su modelo da buenos resultados? ¿Por qué sí o por qué no?
    - ¿Qué problemas encontró al realizar este taller?


Bonus: 
- Usar una función que no hayamos visto en clase ([Aquí](https://radimrehurek.com/gensim/models/word2vec.html#module-gensim.models.word2vec))
- Visualizar el modelo usando PCA

In [1]:
#Análisis de noticias falsas
#preprocesamiento
import re
import pandas as pd
from nltk.corpus import stopwords
stopwords_sp = stopwords.words('spanish')

def pre_procesado(texto):
     texto = texto.lower()
     texto = re.sub(r"[\W\d_]+", " ", texto)
     texto = " ".join([palabra for palabra in texto.split() if len(palabra)>2]) 
     texto = " ".join([palabra for palabra in texto.split() if palabra not in stopwords_sp])
     return texto.split()

fake = pd.read_csv("desktop/archivos/onlyfakes1000.csv")
fake['pp'] = fake['text'].apply(lambda texto: pre_procesado(texto))

fake.head()

Unnamed: 0,text,pp
0,El suceso ha tenido lugar en Brasil. Un adole...,"[suceso, lugar, brasil, adolescente, años, mur..."
1,Estamos en la semana decisiva. Los expertos a...,"[semana, decisiva, expertos, aseguran, campaña..."
2,Estudios científicos hay muchos. Unos nos int...,"[estudios, científicos, interesan, concreto, h..."
3,Ha sucedido en la ciudad de San José de Río P...,"[sucedido, ciudad, san, josé, río, preto, bras..."
4,La fiesta en Sevilla por el vuelco electoral ...,"[fiesta, sevilla, vuelco, electoral, alargó, c..."


In [2]:
import gensim.models.word2vec as w2v

In [3]:
%%time
 
mi_modelo = w2v.Word2Vec(fake['pp'].values,
                          sg=1, # 1 skip-gram
                          seed=1, # semilla
                          size=200, # número de dimensiones
                          min_count=5,
                          window=10)

Wall time: 516 ms


In [4]:
#Palabras más parecidas
mi_modelo.wv.most_similar("menas")

[('dicen', 0.9997541904449463),
 ('nunca', 0.9997429847717285),
 ('amigos', 0.9997411966323853),
 ('ayudar', 0.9997411370277405),
 ('orden', 0.999740481376648),
 ('haciendo', 0.9997392296791077),
 ('comunidad', 0.9997387528419495),
 ('embargo', 0.9997382760047913),
 ('sino', 0.9997377395629883),
 ('haber', 0.9997376203536987)]

In [5]:
mi_modelo.wv.most_similar("prensa")

[('localidad', 0.9997602701187134),
 ('confirmado', 0.9997563362121582),
 ('algún', 0.9997508525848389),
 ('formación', 0.9997497797012329),
 ('hecho', 0.999744713306427),
 ('siempre', 0.9997433423995972),
 ('toda', 0.9997411370277405),
 ('hora', 0.9997395873069763),
 ('nombre', 0.9997395277023315),
 ('sexual', 0.9997395277023315)]

In [6]:
mi_modelo.wv.most_similar("corte")

[('viernes', 0.99973464012146),
 ('algún', 0.9997328519821167),
 ('caso', 0.9997080564498901),
 ('gracias', 0.9997074007987976),
 ('ministerio', 0.9997058510780334),
 ('nueva', 0.9997047781944275),
 ('hizo', 0.9996985197067261),
 ('podría', 0.999698281288147),
 ('hace', 0.9996973872184753),
 ('casi', 0.9996916055679321)]

In [7]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = mi_modelo.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{0} es a {1}, lo que {2} es a {3}".format(start1, end1, start2, end2))

In [8]:
nearest_similarity_cosmul("menas", "extranjeros", "inmigrantes")

menas es a extranjeros, lo que ningún es a inmigrantes


- ¿Su modelo da buenos resultados? ¿Por qué sí o por qué no?
El modelo tiene un buen resultado al traer palabras parecidas, sin embargo para hacer analogías el corpus no es suficiente y es muy difícil encontrar temas que puedan generar buenas analogías
- ¿Qué problemas encontró al realizar este taller?
Decidir los mejores parámetros para el modelo y poder definir cual era el mejor modelo

Visualizar el modelo

In [9]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import iplot

In [10]:
vocab = [e[0] for e in mi_modelo.wv.most_similar("ley")]+[e[0] for e in mi_modelo.wv.most_similar("ministerio")]

print(vocab)

['congreso', 'noticia', 'algún', 'vuelta', 'estancia', 'murcia', 'varones', 'pues', 'decreto', 'haber', 'caso', 'hecho', 'próximo', 'niño', 'calle', 'mena', 'medida', 'organización', 'decidió', 'públicas']


In [11]:
X = mi_modelo[mi_modelo.wv.vocab]

matrix = pd.DataFrame(X)

matrix.index = mi_modelo.wv.vocab.keys()
 
matrix


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
suceso,0.006212,0.031511,0.045481,0.101416,-0.031679,0.150899,-0.059783,-0.014200,0.107744,0.176405,...,0.050822,-0.017535,0.042748,-0.073751,-0.088236,-0.076950,-0.062377,-0.055633,-0.060737,-0.058302
lugar,0.005950,0.040947,0.059403,0.136897,-0.040842,0.201386,-0.075965,-0.015665,0.138248,0.228637,...,0.065123,-0.018993,0.060566,-0.094583,-0.112451,-0.098903,-0.081829,-0.071580,-0.077870,-0.076109
brasil,0.005165,0.026123,0.037262,0.089919,-0.026611,0.130936,-0.052012,-0.013126,0.092490,0.145507,...,0.043393,-0.012164,0.036452,-0.062901,-0.074967,-0.062631,-0.050171,-0.049755,-0.051648,-0.048988
años,0.008973,0.040707,0.061044,0.140177,-0.041050,0.201412,-0.080918,-0.014251,0.143369,0.234809,...,0.066531,-0.022884,0.063267,-0.096038,-0.120391,-0.097196,-0.080598,-0.076261,-0.079407,-0.077067
después,0.004588,0.037437,0.059342,0.144849,-0.044886,0.210980,-0.083985,-0.015445,0.150787,0.239436,...,0.070495,-0.024690,0.062590,-0.098726,-0.118127,-0.101004,-0.084490,-0.078070,-0.085354,-0.076749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mercadona,0.001592,0.029226,0.041898,0.102322,-0.031841,0.150310,-0.054796,-0.013188,0.104230,0.167622,...,0.050685,-0.017119,0.044884,-0.071129,-0.086456,-0.073510,-0.060821,-0.052663,-0.058199,-0.055017
marruecos,0.005181,0.024268,0.038444,0.091734,-0.027209,0.133286,-0.050361,-0.009693,0.092897,0.150116,...,0.046088,-0.014870,0.037095,-0.065604,-0.077990,-0.062612,-0.051981,-0.048857,-0.052151,-0.049877
italia,0.003858,0.030171,0.042547,0.097440,-0.031347,0.146198,-0.056130,-0.010863,0.105478,0.166581,...,0.048054,-0.015512,0.045370,-0.071881,-0.082600,-0.072842,-0.061193,-0.052945,-0.057551,-0.053230
usted,0.003297,0.032344,0.048360,0.108048,-0.034448,0.159390,-0.064360,-0.015801,0.112862,0.186113,...,0.054664,-0.019894,0.049417,-0.078136,-0.090361,-0.079543,-0.064764,-0.060393,-0.062716,-0.057396


In [12]:
pca = PCA(n_components=2)
result = pca.fit_transform(matrix)
result = pd.DataFrame(result)
result.columns = ['X', 'Y']
result['Palabra'] = matrix.index.values
result

Unnamed: 0,X,Y,Palabra
0,0.137783,0.002006,suceso
1,-0.185469,0.002270,lugar
2,0.297975,-0.001699,brasil
3,-0.208962,-0.002381,años
4,-0.250687,-0.002270,después
...,...,...,...
931,0.167823,-0.001028,mercadona
932,0.273000,0.000120,marruecos
933,0.175010,0.001343,italia
934,0.075135,0.001296,usted


In [13]:
trace = go.Scatter(x=result['X'].values,
                    y=result['Y'].values,
                    text=result['Palabra'].values,
                    mode='markers') 
 
layout = go.Layout(title="PCA mi modelo")
 
fig = go.Figure(data=trace, layout=layout)

iplot(fig)