diff --git a/lessons/02_bag_of_words.ipynb b/lessons/02_bag_of_words.ipynb index cbc9046..67ebae3 100644 --- a/lessons/02_bag_of_words.ipynb +++ b/lessons/02_bag_of_words.ipynb @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "id": "9e4a3a0d-66f4-44e5-8dd6-5f441146014d", "metadata": { "scrolled": true, @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, "id": "21ed437f-9767-43b7-abc5-159aa4339a31", "metadata": {}, "outputs": [], @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 70, "id": "f3862ffd-918f-4184-8c90-8a39a8a2a069", "metadata": {}, "outputs": [], @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 71, "id": "4190e351-97b7-4c5b-866e-07aa6cbd42c2", "metadata": {}, "outputs": [], @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 72, "id": "79acbaf2-6625-4abb-b50f-97ea54ba0d11", "metadata": {}, "outputs": [ @@ -290,7 +290,7 @@ "4 2015-02-24 11:14:45 -0800 NaN Pacific Time (US & Canada) " ] }, - "execution_count": 3, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } @@ -316,7 +316,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 73, "id": "a1faaf90-8c01-4d25-9468-90c01823f0d5", "metadata": {}, "outputs": [], @@ -334,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 74, "id": "438830e6-1064-47fe-b578-a1ca693a0ed0", "metadata": {}, "outputs": [ @@ -369,13 +369,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 75, "id": "01955158-6954-447a-acb6-2989d02a49c3", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -404,7 +404,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 76, "id": "428ddde7-af73-4eb6-92c9-041a1791ca59", "metadata": {}, "outputs": [ @@ -417,7 +417,7 @@ "Name: retweet_count, dtype: float64" ] }, - "execution_count": 7, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } @@ -439,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 77, "id": "12aa9f2d-d655-494a-bb72-08ad973518f3", "metadata": {}, "outputs": [ @@ -519,7 +519,7 @@ "Virgin America 0.543544 0.456456" ] }, - "execution_count": 8, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } @@ -581,19 +581,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 78, "id": "21738b02-9ab9-4a61-b41f-ff75888aa747", "metadata": { "tags": [] }, "outputs": [], "source": [ - "from utils import placeholder" + "from utils import placeholder\n", + "import re" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "id": "03569f0d-34ba-492d-aa1d-1dce9d34f792", "metadata": {}, "outputs": [], @@ -604,21 +605,21 @@ "def preprocess(text):\n", " '''Create a preprocess pipeline that cleans the tweet data.'''\n", " \n", - " # Step 1: Lowercase\n", - " text = ...\n", - "\n", - " # Step 2: Replace patterns with placeholders\n", - " text = ...\n", - "\n", + " # Step 1: Convert text to lowercase\n", + " text = text.lower()\n", + " \n", + " # Step 2: Replace patterns with placeholders (URLs, digits, hashtags, user handles)\n", + " text = placeholder(text)\n", + " \n", " # Step 3: Remove extra whitespace characters\n", - " text = ...\n", - "\n", + " text = re.sub(blankspace_pattern, blankspace_repl, text)\n", + " \n", " return text" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 80, "id": "8990cefd-5d04-46ba-ada2-29978c28cfe8", "metadata": {}, "outputs": [ @@ -628,7 +629,7 @@ "text": [ "lol @justinbeiber and @BillGates are like soo 2000 #yesterday 
#amiright saw it on https://twitter.com #yolo\n", "==================================================\n", - "lol USER and USER are like soo DIGIT HASHTAG HASHTAG saw it on URL HASHTAG\n" + "lol USER and USER are like soo DIGIT HASHTAG HASHTAG saw it on URL HASHTAG \n" ] } ], @@ -645,7 +646,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 81, "id": "a5f7bb6a-f064-48cc-b650-12c4ef2fbb88", "metadata": { "scrolled": true @@ -654,15 +655,15 @@ { "data": { "text/plain": [ - "0 USER plus you've added commercials to the expe...\n", - "1 USER it's really aggressive to blast obnoxious...\n", + "0 USER plus you've added commercials to the exp...\n", + "1 USER it's really aggressive to blast obnoxiou...\n", "2 USER and it's a really big bad thing about it\n", - "3 USER seriously would pay $ DIGIT a flight for ...\n", - "4 USER yes, nearly every time i fly vx this “ear...\n", + "3 USER seriously would pay $ DIGIT a flight for...\n", + "4 USER yes, nearly every time i fly vx this “ea...\n", "Name: text_processed, dtype: object" ] }, - "execution_count": 12, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -687,17 +688,17 @@ "metadata": {}, "source": [ "\n", - "# The Bag-of-Words Representation\n", + "# La Representación Bag-of-Words\n", "\n", - "The idea of bag-of-words (BoW), as the name suggests, is quite intuitive: we take a document and toss it in a bag. The action of \"throwing\" the document in a bag disregards the relative position between words, so what is \"in the bag\" is essentially \"an unsorted set of words\" [(Jurafsky & Martin, 2024)](https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf). In return, we have a list of unique words and the frequency of each of them. \n", + "La idea de bag-of-words (BoW), como sugiere el nombre, es bastante intuitiva: tomamos un documento y lo arrojamos en una bolsa. 
La acción de \"arrojar\" el documento en una bolsa ignora la posición relativa entre las palabras, por lo que lo que queda \"en la bolsa\" es esencialmente \"un conjunto desordenado de palabras\" [(Jurafsky & Martin, 2024)](https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf). A cambio, obtenemos una lista de palabras únicas y la frecuencia de cada una de ellas. \n", "\n", - "For example, as shown in the following illustration, the word \"coffee\" appears twice. \n", + "Por ejemplo, como se muestra en la siguiente ilustración, la palabra \"coffee\" aparece dos veces. \n", "\n", "\"BoW-Part2\"\n", "\n", - "With a bag-of-words representation, we make heavy use of word frequency but not too much of word order. \n", + "Con una representación bag-of-words, hacemos un uso intensivo de la frecuencia de las palabras, pero no tanto del orden en que aparecen. \n", "\n", - "In the context of sentiment analysis, the sentiment of a tweet is conveyed more strongly by specific words. For example, if a tweet contains the word \"happy,\" it likely conveys positive sentiment, but not always (e.g., \"not happy\" denotes the opposite sentiment). When these words come up more often, they'll probably more strongly convey the sentiment." + "En el contexto del análisis de sentimiento, el sentimiento de un tweet se transmite más fuertemente a través de palabras específicas. Por ejemplo, si un tweet contiene la palabra \"happy\", es probable que transmita un sentimiento positivo, aunque no siempre (por ejemplo, \"not happy\" denota el sentimiento opuesto). Cuando estas palabras aparecen con mayor frecuencia, probablemente transmitirán el sentimiento con más fuerza.\n" ] }, { @@ -707,13 +708,13 @@ "source": [ "## Document Term Matrix\n", "\n", - "Now let's implement the idea of bag-of-words. Before we dive deeper, let's step back for a moment. 
In practice, text analysis often involves handling many documents; from now on, we use the term **document** to represent a piece of text on which we perform analysis. It could be a phrase, a sentence, a tweet, or any other text—as long as it can be represented by a string, the length dosen't really matter. \n", + "Ahora implementemos la idea de bag-of-words. Antes de profundizar, retrocedamos un momento. En la práctica, el análisis de texto a menudo implica manejar múltiples documentos; de ahora en adelante, utilizaremos el término **document** para representar un fragmento de texto sobre el cual realizamos análisis. Puede ser una frase, una oración, un tweet o cualquier otro texto—mientras pueda representarse como una cadena de caracteres, su longitud no es realmente un problema. \n", "\n", - "Imagine we have four documents (i.e., the four phrases shown above), and we toss them all in the bag. Instead of a word-frequency list, we'd expect a document-term matrix (DTM) in return. In a DTM, the word list is the **vocabulary** (V) that holds all unique words occur across the documents. For each **document** (D), we count the number of occurence of each word in the vocabulary, and then plug the number into the matrix. In other words, the DTM we will construct is a $D \\times V$ matrix, where each row corresponds to a document, and each column corresponds to a token (or \"term\").\n", + "Imagina que tenemos cuatro documentos (es decir, las cuatro frases mostradas anteriormente) y los arrojamos todos en la bolsa. En lugar de obtener una lista de frecuencias de palabras, obtendremos una document-term matrix (DTM). En una DTM, la lista de palabras constituye el **vocabulary** (V), que contiene todas las palabras únicas que aparecen en los documentos. Para cada **document** (D), contamos la cantidad de veces que aparece cada palabra en el vocabulario y luego colocamos ese número en la matriz. 
En otras palabras, la DTM que construiremos es una matriz $D \\times V$, donde cada fila corresponde a un documento y cada columna a un token (o \"término\"). \n", "\n", - "The unique tokens in this set of documents, arranged in alphabetical order, form the columns. For each document, we mark the occurence of each word present in the document. The numerical representation for each document is a row in the matrix. For example, the first document, \"the coffee roaster,\" has the numerical representation $[0, 1, 0, 0, 0, 1, 1, 0]$.\n", + "Los tokens únicos en este conjunto de documentos, organizados en orden alfabético, forman las columnas. Para cada documento, marcamos la frecuencia de cada palabra presente en el documento. La representación numérica de cada documento es una fila en la matriz. Por ejemplo, el primer documento, \"the coffee roaster\", tiene la representación numérica $[0, 1, 0, 0, 0, 1, 1, 0]$. \n", "\n", - "Note that the left index column now displays these documents as text, but typically we would just assign an index to each of them. \n", + "Nota que la columna de índices a la izquierda muestra estos documentos como texto, pero típicamente solo se les asignaría un número de índice. \n", "\n", "$$\n", "\\begin{array}{c|cccccccccccc}\n", @@ -725,12 +726,12 @@ "\\end{array}\n", "$$\n", "\n", - "To create a DTM, we will use `CountVectorizer` from the package `sklearn`." 
+ "Para crear una DTM, utilizaremos `CountVectorizer` del paquete `sklearn`.\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 82, "id": "cd2adf56-ba93-459d-8cfa-16ce8dc9284b", "metadata": {}, "outputs": [], @@ -743,11 +744,11 @@ "id": "4989781d-6b40-417a-be70-eeba05cd8a50", "metadata": {}, "source": [ - "The following illustration depicts the three-step workflow of creating a DTM with `CountVectorizr`.\n", + "La siguiente ilustración muestra el flujo de trabajo en tres pasos para crear una DTM con `CountVectorizer`.\n", "\n", "\"CountVectorizer\"\n", "\n", - "Let's walk through these steps with the toy example shown above." + "Repasemos estos pasos utilizando el ejemplo simple mostrado anteriormente." ] }, { @@ -755,12 +756,12 @@ "id": "34174034-46b9-43e2-a511-5972d378cb00", "metadata": {}, "source": [ - "### A Toy Example" + "### Un Ejemplo Sencillo\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 83, "id": "4da2bd3d-0460-4b5f-9b9e-02940db0d7ca", "metadata": {}, "outputs": [], @@ -777,14 +778,14 @@ "id": "dff7c1d3-fcee-4e20-b9a7-17306ebd5fc2", "metadata": {}, "source": [ - "The first step is to initialize a `CountVectorizer` object. Within the round paratheses, we can specify parameter settings if desired. Let's take a look at the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) and see what options are available. \n", + "El primer paso es inicializar un objeto `CountVectorizer`. Dentro de los paréntesis, podemos especificar parámetros de configuración si lo deseamos. Echemos un vistazo a la [documentación](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) para ver qué opciones están disponibles. \n", "\n", - "For now we can just leave it blank to use the default settings. " + "Por ahora, podemos dejarlo en blanco para usar la configuración predeterminada. 
" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 84, "id": "9de3fe6a-9abf-4e11-aad1-e54c891567bb", "metadata": {}, "outputs": [], @@ -798,14 +799,14 @@ "id": "1b5a7d0d-0bfc-4fb9-8e5f-e91e39797fb5", "metadata": {}, "source": [ - "The second step is to `fit` this `CountVectorizer` object to the data, which means creating a vocabulary of tokens from the set of documents. Thirdly, we `transform` our data according to the \"fitted\" `CountVectorizer` object, which means taking each of the document and counting the occurrences of tokens according to the vocabulary established during the \"fitting\" step.\n", + "El segundo paso es aplicar `fit` al objeto `CountVectorizer` con los datos, lo que significa crear un vocabulario de tokens a partir del conjunto de documentos. Luego, en el tercer paso, usamos `transform` para procesar nuestros datos de acuerdo con el objeto `CountVectorizer` \"ajustado\". Esto implica tomar cada documento y contar la aparición de tokens según el vocabulario establecido durante el paso de \"ajuste\". \n", "\n", - "It may sound a bit complex but steps 2 and 3 can be done in one swoop using a `fit_transform` function." + "Puede sonar un poco complejo, pero los pasos 2 y 3 pueden realizarse en una sola operación utilizando la función `fit_transform`. " ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 85, "id": "da1bbad4-bb1a-4b92-9096-6e17558b4a42", "metadata": {}, "outputs": [], @@ -819,25 +820,25 @@ "id": "324d3b65-4e98-48bf-87d2-399457f4939c", "metadata": {}, "source": [ - "The return of `fit_transform` is supposed to be the DTM. \n", + "El resultado de `fit_transform` debería ser la DTM. \n", "\n", - "Let's take a look at it!" + "¡Echemos un vistazo! 
" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 86, "id": "cb044001-8eb2-4489-b025-2d8e2d4bfee2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "<4x8 sparse matrix of type ''\n", - "\twith 9 stored elements in Compressed Sparse Row format>" + "" ] }, - "execution_count": 17, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } @@ -851,14 +852,14 @@ "id": "f9817b09-a806-42c4-9436-822cc27a38b9", "metadata": {}, "source": [ - "Apparently we've got a \"sparse matrix\"—a matrix that contains a lot of zeros. This makes sense. For each document, there are words that don't occur at all, and these are counted as zero in the DTM. This sparse matrix is stored in a \"Compressed Sparse Row\" format, a memory-saving format designed for handling sparse matrices. \n", + "Aparentemente, hemos obtenido una \"sparse matrix\", es decir, una matriz que contiene muchos ceros. Esto tiene sentido: en cada documento, hay palabras que no aparecen en absoluto, y estas se registran como ceros en la DTM. Esta matriz dispersa se almacena en un formato \"Compressed Sparse Row\", un formato optimizado para ahorrar memoria al manejar matrices dispersas. \n", "\n", - "Let's convert it to a dense matrix, where those zeros are probably represented, as in a numpy array." + "Convirtámosla en una matriz densa, donde esos ceros probablemente estén representados, como en un array de numpy. " ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 87, "id": "bb03a238-87d8-40c9-b20e-66e7c9b6576b", "metadata": {}, "outputs": [ @@ -871,7 +872,7 @@ " [0, 1, 0, 0, 0, 0, 0, 1]])" ] }, - "execution_count": 18, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -886,12 +887,12 @@ "id": "28b58a63-d7f6-4b9f-aadf-4d4fc7341336", "metadata": {}, "source": [ - "So this is our DTM! The matrix is the same as shown above. To make it more reader-friendly, let's convert it to a dataframe. 
The column names should be tokens in the vocabulary, which we can access with the `get_feature_names_out` function." + "¡Así que esta es nuestra DTM! La matriz es la misma que mostramos anteriormente. Para hacerla más fácil de leer, convirtámosla en un dataframe. Los nombres de las columnas deben ser los tokens del vocabulario, a los cuales podemos acceder con la función `get_feature_names_out`. " ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 88, "id": "714de5d3-e37d-4a19-9ade-3c6629e38d4e", "metadata": {}, "outputs": [ @@ -902,7 +903,7 @@ " 'time'], dtype=object)" ] }, - "execution_count": 19, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -914,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 89, "id": "6a7729a2-ca2e-4de7-8795-74dfedb7a4d5", "metadata": {}, "outputs": [], @@ -929,12 +930,12 @@ "id": "781da407-f394-40f2-9d45-1fac39f02047", "metadata": {}, "source": [ - "Here it is! The DTM of our toy data is now a dataframe. The index of `test_dtm` corresponds to the position of each document in the `test` list. " + "¡Aquí está! La DTM de nuestros datos de ejemplo ahora es un dataframe. El índice de `test_dtm` corresponde a la posición de cada documento en la lista `test`. 
" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 90, "id": "e41dd243-cd2e-43c3-80f8-5eaab6e64210", "metadata": {}, "outputs": [ @@ -1026,7 +1027,7 @@ "3 0 1 0 0 0 0 0 1" ] }, - "execution_count": 21, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -1040,20 +1041,20 @@ "id": "d59a03b4-94fa-4fe7-8f5d-7280e31b9bc4", "metadata": {}, "source": [ - "Hopefully this toy example provides a clear walkthrough of creating a DTM.\n", + "Esperamos que este ejemplo sencillo haya proporcionado una guía clara para crear una DTM.\n", "\n", - "Now it's time for our tweets data!\n", + "¡Ahora es el momento de trabajar con nuestros datos de tweets!\n", "\n", - "### DTM for Tweets\n", + "### DTM para Tweets\n", "\n", - "We'll begin by initializing a `CountVectorizer` object. In the following cell, we have included a few parameters that people often adjust. These parameters are currently set to their default values.\n", + "Comenzaremos inicializando un objeto `CountVectorizer`. En la siguiente celda, hemos incluido algunos parámetros que las personas ajustan con frecuencia. Estos parámetros están configurados actualmente con sus valores predeterminados.\n", "\n", - "When we construct a DTM, the default is to lowercase the input text. If nothing is provided for `stop_words`, the default is to keep them. The next three parameters are used to control the size of the vocabulary, which we'll return to in a minute." + "Cuando construimos una DTM, el valor predeterminado es convertir a minúsculas el texto de entrada. Si no se proporciona nada para `stop_words`, el valor predeterminado es mantenerlas. Los siguientes tres parámetros se usan para controlar el tamaño del vocabulario, sobre lo cual volveremos en un momento." 
] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 91, "id": "783e44a4-4a22-4290-b222-282b02c080dc", "metadata": {}, "outputs": [], @@ -1068,7 +1069,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 92, "id": "f85e76ea-bc54-4775-bcda-432a03d2c96f", "metadata": { "scrolled": true @@ -1077,11 +1078,11 @@ { "data": { "text/plain": [ - "<11541x8751 sparse matrix of type ''\n", - "\twith 191139 stored elements in Compressed Sparse Row format>" + "" ] }, - "execution_count": 23, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } @@ -1094,7 +1095,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 93, "id": "87119057-c78c-4eb2-a9d6-3e9f44e4c22b", "metadata": {}, "outputs": [ @@ -1107,10 +1108,10 @@ " ...,\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", - " [0, 0, 0, ..., 0, 0, 0]])" + " [0, 0, 0, ..., 0, 0, 0]], shape=(11541, 8751))" ] }, - "execution_count": 24, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } @@ -1122,7 +1123,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 94, "id": "99322b85-1a15-46a5-bb80-bb5eaa6eeb7b", "metadata": {}, "outputs": [], @@ -1133,7 +1134,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 95, "id": "43620587-3795-4434-8f1f-145c81b93706", "metadata": {}, "outputs": [ @@ -1160,12 +1161,12 @@ "id": "2dd257d5-4244-436c-afe7-5688232caf8f", "metadata": {}, "source": [ - "If we leave the `CountVectorizer` to the default setting, the vocabulary size of the tweet data is 8751. " + "Si dejamos el `CountVectorizer` con la configuración predeterminada, el tamaño del vocabulario de los datos de los tweets es 8751. 
" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 96, "id": "bb3604ec-d909-4238-9a3f-67e7d4ae2ac5", "metadata": {}, "outputs": [ @@ -1357,7 +1358,7 @@ "[5 rows x 8751 columns]" ] }, - "execution_count": 27, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -1371,14 +1372,14 @@ "id": "095d34e2-52f8-4419-b4c7-ed20dbd5df89", "metadata": {}, "source": [ - "Most of the tokens have zero occurences at least in the first five tweets. \n", + "La mayoría de los tokens tienen cero ocurrencias, al menos en los primeros cinco tweets. \n", "\n", - "Let's take a closer look at the DTM!" + "¡Echemos un vistazo más de cerca a la DTM! " ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 97, "id": "f432154a-eae0-4723-a797-55f3cfdd71c4", "metadata": {}, "outputs": [ @@ -1398,7 +1399,7 @@ "dtype: int64" ] }, - "execution_count": 28, + "execution_count": 97, "metadata": {}, "output_type": "execute_result" } @@ -1410,27 +1411,27 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 98, "id": "26c7f1c9-dd66-49f2-b337-01253da551d2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "_exact_ 1\n", - "mightmismybrosgraduation 1\n", - "midterm 1\n", - "midnite 1\n", - "midland 1\n", - "michelle 1\n", - "michele 1\n", - "michael 1\n", - "mhtt 1\n", - "mgmt 1\n", + "zones 1\n", + "accelerate 1\n", + "acc 1\n", + "acarl 1\n", + "yogurt 1\n", + "yoga 1\n", + "yikes 1\n", + "absurdity 1\n", + "absorber 1\n", + "absorb 1\n", "dtype: int64" ] }, - "execution_count": 29, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } @@ -1445,14 +1446,14 @@ "id": "5d230f79-e752-4e32-93db-4f013287f8e2", "metadata": {}, "source": [ - "It is not surprising to see \"user\" and \"digit\" to be among the most frequent tokens as we replaced each idiosyncratic one with these placeholders. 
The rest of the most frequent tokens are mostly stop words.\n", + "No es sorprendente ver que \"user\" y \"digit\" estén entre los tokens más frecuentes, ya que reemplazamos cada uno de los idiosincráticos con estos marcadores de posición. El resto de los tokens más frecuentes son principalmente palabras vacías (stop words).\n", "\n", - "Perhaps a more interesting pattern is to look for which token appears most in any given tweet:" + "Tal vez un patrón más interesante sea buscar qué token aparece más en cualquier tweet dado:" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 99, "id": "efb8f4d8-4c88-4155-a6c5-c72a5b4e8bb8", "metadata": {}, "outputs": [ @@ -1493,42 +1494,42 @@ " 6\n", " \n", " \n", - " 10572\n", + " 11007\n", " to\n", " 5\n", " \n", " \n", - " 8148\n", - " the\n", + " 5513\n", + " to\n", " 5\n", " \n", " \n", - " 10742\n", + " 7750\n", " to\n", " 5\n", " \n", " \n", - " 152\n", - " to\n", + " 10923\n", + " the\n", " 5\n", " \n", " \n", - " 5005\n", + " 4089\n", " to\n", " 5\n", " \n", " \n", - " 10923\n", - " the\n", + " 8134\n", + " to\n", " 5\n", " \n", " \n", - " 7750\n", - " to\n", + " 8148\n", + " the\n", " 5\n", " \n", " \n", - " 355\n", + " 557\n", " to\n", " 5\n", " \n", @@ -1540,17 +1541,17 @@ " token number\n", "3127 lt 6\n", "918 worst 6\n", - "10572 to 5\n", - "8148 the 5\n", - "10742 to 5\n", - "152 to 5\n", - "5005 to 5\n", - "10923 the 5\n", + "11007 to 5\n", + "5513 to 5\n", "7750 to 5\n", - "355 to 5" + "10923 the 5\n", + "4089 to 5\n", + "8134 to 5\n", + "8148 the 5\n", + "557 to 5" ] }, - "execution_count": 30, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -1575,14 +1576,14 @@ "id": "7cdac4ef-6b9d-4aad-9b24-c70f6c2eb8f0", "metadata": {}, "source": [ - "It looks like among all tweets, at most a token appears six times, and it is either the word \"It\" or the word \"worst.\" \n", + "Parece que, entre todos los tweets, como máximo un token aparece seis veces, y es ya sea la 
palabra \"lt\" o la palabra \"worst.\"\n", "\n", - "Let's go back to our tweets dataframe and locate the 918th tweet." + "Volvamos a nuestro dataframe de tweets y ubiquemos el tweet número 918." ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 100, "id": "5e7cacd8-1fb3-4f0d-a744-4ee0994a089f", "metadata": {}, "outputs": [ @@ -1592,7 +1593,7 @@ "\"@united is the worst. Worst reservation policies. Worst costumer service. Worst worst worst. Congrats, @Delta you're not that bad!\"" ] }, - "execution_count": 31, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } @@ -1607,22 +1608,22 @@ "id": "3dba8e37-4880-4565-b6fc-7e7c96958f0f", "metadata": {}, "source": [ - "## Customize the `CountVectorizer`\n", + "## Personalizar el `CountVectorizer`\n", "\n", - "So far we've always used the default parameter setting to create our DTMs, but in many cases we may want to customize the `CountVectorizer` object. The purpose of doing so is to further filter out unnecessary tokens. In the example below, we tweak the following parameters:\n", + "Hasta ahora, siempre hemos utilizado la configuración predeterminada de parámetros para crear nuestras DTMs, pero en muchos casos, es posible que queramos personalizar el objeto `CountVectorizer`. El propósito de hacerlo es filtrar más a fondo los tokens innecesarios. 
En el ejemplo siguiente, ajustamos los siguientes parámetros:\n", "\n", - "- `stop_words = 'english'`: ignore English stop words \n", - "- `min_df = 2`: ignore words that don't occur at least twice\n", - "- `max_df = 0.95`: ignore words if they appear in more than 95\\% of the documents\n", + "- `stop_words = 'english'`: ignorar las palabras vacías en inglés\n", + "- `min_df = 2`: ignorar palabras que no aparecen en al menos dos documentos\n", + "- `max_df = 0.95`: ignorar palabras que aparecen en más del 95\\% de los documentos\n", "\n", - "🔔 **Question**: Let's pause for a minute to discuss whether it sounds reasonable to set these parameters! What do you think?\n", + "🔔 **Pregunta**: ¡Paremos un minuto para discutir si tiene sentido establecer estos parámetros! ¿Qué opinas?\n", "\n", - "Oftentimes, we are not interested in words whose frequencies are either too low or too high, so we use `min_df` and `max_df` to filter them out. Alternatively, we can define our vocabulary size as $N$ by setting `max_features`. In other words, we tell `CountVectorizer` to only consider the top $N$ most frequent tokens when constructing the DTM." + "A menudo, no estamos interesados en palabras cuya frecuencia es demasiado baja o demasiado alta, por lo que usamos `min_df` y `max_df` para filtrarlas. Alternativamente, podemos definir el tamaño de nuestro vocabulario como $N$ configurando `max_features`. En otras palabras, le decimos a `CountVectorizer` que solo considere los $N$ tokens más frecuentes al construir la DTM." 
] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 101, "id": "37a0a93e-9dd8-43dc-a82c-06a24bf02bc9", "metadata": {}, "outputs": [], @@ -1637,7 +1638,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 102, "id": "b53e5ecf-7be3-4915-9d11-fd3edb913400", "metadata": {}, "outputs": [], @@ -1657,12 +1658,12 @@ "id": "6d2e66bc-2eaa-4642-8848-74459948084b", "metadata": {}, "source": [ - "Our second DTM has a substantially smaller vocabulary compared to the first one." + "Nuestra segunda DTM tiene un vocabulario considerablemente más pequeño en comparación con la primera." ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 103, "id": "570fb598-fa81-4111-9e36-7172d8034713", "metadata": {}, "outputs": [ @@ -1682,7 +1683,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 104, "id": "d8deabb2-20eb-4047-b592-48cb1564fd2a", "metadata": {}, "outputs": [ @@ -1874,7 +1875,7 @@ "[5 rows x 4471 columns]" ] }, - "execution_count": 35, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } @@ -1888,12 +1889,12 @@ "id": "998fe2c3-ec90-4027-8c7f-417327a33a27", "metadata": {}, "source": [ - "The most frequent token list now includes words that make more sense to us, such as \"cancelled\" and \"service.\" " + "La lista de tokens más frecuentes ahora incluye palabras que tienen más sentido para nosotros, como \"cancelled\" y \"service.\"" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 105, "id": "ffa7bf4e-640b-49bc-b64b-721140f67f76", "metadata": {}, "outputs": [ @@ -1913,7 +1914,7 @@ "dtype: int64" ] }, - "execution_count": 36, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -1945,19 +1946,23 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 106, "id": "da610560-62c3-48ab-a1b2-25e0b589bc61", "metadata": {}, "outputs": [], "source": [ "# Import spaCy\n", "import spacy\n", + "import pandas as 
pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Cargar el modelo de spaCy\n", "nlp = spacy.load('en_core_web_sm')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "id": "98ead266-30f3-48ad-bc51-c1685487f000", "metadata": { "scrolled": true @@ -1967,17 +1972,17 @@ "# Create a function to lemmatize text\n", "def lemmatize_text(text):\n", " '''Lemmatize the text input with spaCy annotations.'''\n", - "\n", + " \n", " # Step 1: Initialize an empty list to hold lemmas\n", - " lemma = ...\n", - "\n", + " lemma = []\n", + " \n", " # Step 2: Apply the nlp pipeline to input text\n", - " doc = ...\n", - "\n", + " doc = nlp(text)\n", + " \n", " # Step 3: Iterate over tokens in the text to get the token lemma\n", " for token in doc:\n", - " lemma.append(...)\n", - "\n", + " lemma.append(token.lemma_)\n", + " \n", " # Step 4: Join lemmas together into a single string\n", " text_lemma = ' '.join(lemma)\n", " \n", @@ -1994,7 +1999,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 108, "id": "742e82bb-5c42-4fa8-9101-5a0ea908db25", "metadata": {}, "outputs": [ @@ -2002,9 +2007,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "USER wow this just blew my mind\n", + " USER wow this just blew my mind\n", "==================================================\n", - "USER wow this just blow my mind\n" + " USER wow this just blow my mind\n" ] } ], @@ -2025,7 +2030,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 109, "id": "1ac128d2-1be5-4ef5-bb50-5b8d44ef8ee9", "metadata": {}, "outputs": [], @@ -2044,7 +2049,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 110, "id": "5f49d790-3c9d-4dc1-a5c9-72c306630412", "metadata": {}, "outputs": [ @@ -2215,7 +2220,7 @@ " \n", " \n", "\n", - "

5 rows × 3553 columns

\n", + "

5 rows × 3571 columns

\n", "" ], "text/plain": [ @@ -2233,10 +2238,10 @@ "3 0 0 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 0 0 \n", "\n", - "[5 rows x 3553 columns]" + "[5 rows x 3571 columns]" ] }, - "execution_count": 41, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -2262,7 +2267,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 111, "id": "9859eb04-dbd2-4fa0-9798-65ed7496c297", "metadata": {}, "outputs": [ @@ -2272,7 +2277,7 @@ "text": [ "(11541, 8751)\n", "(11541, 4471)\n", - "(11541, 3553)\n" + "(11541, 3571)\n" ] } ], @@ -2293,7 +2298,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 112, "id": "5745ca29-97ed-4fe1-81db-7e402c8da674", "metadata": {}, "outputs": [ @@ -2301,19 +2306,19 @@ "data": { "text/plain": [ "digit 6927\n", - "flight 4043\n", + "flight 3952\n", "hashtag 2633\n", - "thank 1455\n", + "thank 1454\n", "hour 1134\n", - "cancel 948\n", - "delay 937\n", - "service 937\n", + "cancel 951\n", + "service 939\n", + "delay 934\n", "customer 902\n", - "time 856\n", + "time 860\n", "dtype: int64" ] }, - "execution_count": 43, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -2325,7 +2330,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 113, "id": "16c63e6a-50c3-448a-9a56-a1d193cd6680", "metadata": {}, "outputs": [ @@ -2345,7 +2350,7 @@ "dtype: int64" ] }, - "execution_count": 44, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -2377,7 +2382,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 114, "id": "f5e32d8a-c42d-475f-aab4-21eca8b1aee8", "metadata": {}, "outputs": [], @@ -2387,7 +2392,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 115, "id": "d23916c1-5693-456c-b71d-6d9d78d1e2e4", "metadata": {}, "outputs": [], @@ -2402,18 +2407,18 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 116, "id": 
"7af5b342-ab18-4766-9561-e38e50cd1e9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "<11541x3553 sparse matrix of type ''\n", - "\twith 88287 stored elements in Compressed Sparse Row format>" + "" ] }, - "execution_count": 47, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } @@ -2426,7 +2431,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 117, "id": "55e509c8-5402-4be0-9143-0e448fff7066", "metadata": {}, "outputs": [ @@ -2597,7 +2602,7 @@ " \n", " \n", "\n", - "

5 rows × 3553 columns

\n", + "

5 rows × 3571 columns

\n", "" ], "text/plain": [ @@ -2615,10 +2620,10 @@ "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", - "[5 rows x 3553 columns]" + "[5 rows x 3571 columns]" ] }, - "execution_count": 48, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } @@ -2657,7 +2662,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 118, "id": "995b511a-d448-4cfb-a6a0-22a465efd8a8", "metadata": {}, "outputs": [ @@ -2675,10 +2680,10 @@ "zone 3177\n", "zoom 3920\n", "zurich 10622\n", - "Length: 3553, dtype: int64" + "Length: 3571, dtype: int64" ] }, - "execution_count": 49, + "execution_count": 118, "metadata": {}, "output_type": "execute_result" } @@ -2698,17 +2703,17 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 119, "id": "09b222fb-ad8c-4767-a974-dd261370a06e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "918" + "np.int64(918)" ] }, - "execution_count": 50, + "execution_count": 119, "metadata": {}, "output_type": "execute_result" } @@ -2727,17 +2732,17 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 120, "id": "079ee0e0-476f-4236-ba8a-615ba7a0efe8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "\"USER is the worst. worst reservation policies. worst costumer service. worst worst worst. congrats, USER you're not that bad!\"" + "\" USER is the worst. worst reservation policies. worst costumer service. worst worst worst. 
congrats, USER you're not that bad!\"" ] }, - "execution_count": 51, + "execution_count": 120, "metadata": {}, "output_type": "execute_result" } @@ -2756,17 +2761,17 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 121, "id": "f809df1a-1178-4272-a415-42edb20173b2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5945" + "np.int64(5945)" ] }, - "execution_count": 52, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } @@ -2777,17 +2782,17 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 122, "id": "8093b6a7-54ca-468a-9376-b3c0be0b6f9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'USER cancelled flighted 😢'" + "' USER cancelled flighted 😢'" ] }, - "execution_count": 53, + "execution_count": 122, "metadata": {}, "output_type": "execute_result" } @@ -2820,34 +2825,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 123, "id": "2bfbf838-9ff6-48b8-ad5d-5e75304fe060", "metadata": {}, "outputs": [], "source": [ "# Complete the boolean masks \n", - "positive_index = tweets[...].index\n", - "negative_index = tweets[...].index" + "positive_index = tweets[tweets['airline_sentiment'] == 'positive'].index\n", + "negative_index = tweets[tweets['airline_sentiment'] == 'negative'].index" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 124, "id": "8c67ea1f-de9e-49a9-94f2-a3351446e364", "metadata": {}, "outputs": [], "source": [ "# Complete the following two lines\n", - "pos = tfidf.loc[...].mean().sort_values(...).head(...)\n", - "neg = tfidf.loc[...].mean().sort_values(...).head(...)" + "pos = tfidf.loc[positive_index].mean().sort_values(ascending=False).head(10)\n", + "neg = tfidf.loc[negative_index].mean().sort_values(ascending=False).head(10)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 125, "id": "f1e29043-8c78-4e41-81d2-b4552030b457", "metadata": {}, - "outputs": [], + "outputs": [ + { + 
"data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "pos.plot(kind='barh', \n", " xlim=(0, 0.18),\n", @@ -2857,10 +2873,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 126, "id": "e8b25940-2372-4755-818e-f75e4d23daf9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "neg.plot(kind='barh', \n", " xlim=(0, 0.18),\n", @@ -2904,7 +2931,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 127, "id": "33413d63-87eb-489f-b374-3cfeaa51cf3c", "metadata": {}, "outputs": [], @@ -2923,7 +2950,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 128, "id": "64cec8b9-14d9-4897-9c02-cc89fcf7b3c6", "metadata": {}, "outputs": [], @@ -2944,10 +2971,22 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": null, "id": "d46de0b2-af00-4a1d-b4cd-31b96ce545d1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mEl kernel se bloqueó al ejecutar código en la celda actual o en una celda anterior. \n", + "\u001b[1;31mRevise el código de las celdas para identificar una posible causa del error. \n", + "\u001b[1;31mHaga clic aquí para obtener más información. \n", + "\u001b[1;31mVea Jupyter log para obtener más detalles." 
+ ] + } + ], "source": [ "def fit_logistic_regression(X, y):\n", " '''Fits a logistic regression model to provided data.'''\n", @@ -2971,7 +3010,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": null, "id": "773963bd-6603-4fad-884b-09ce60afab18", "metadata": {}, "outputs": [], @@ -2982,7 +3021,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": null, "id": "e10d06c1-d884-45d4-a03d-dd5d40bf70aa", "metadata": {}, "outputs": [ @@ -3021,7 +3060,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "6dcb6ef1-13b3-437e-813c-7118911847a4", "metadata": {}, "outputs": [], @@ -3040,7 +3079,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": null, "id": "3e63814e-9c0d-4f7a-a5e0-72cca2758d71", "metadata": {}, "outputs": [ @@ -3151,7 +3190,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": null, "id": "0d596bf7-753c-40cd-ac52-4a37163650ae", "metadata": {}, "outputs": [ @@ -3270,7 +3309,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": null, "id": "17b1223b-e5c1-4992-bb7e-0a99651c3729", "metadata": {}, "outputs": [ @@ -3297,7 +3336,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "159e00c6-8a9f-484f-aea2-853fd5512083", "metadata": {}, "outputs": [ @@ -3350,7 +3389,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -3364,7 +3403,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.1" } }, "nbformat": 4,