from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import json
import csv

# Locations of the training (TL) and validation (VL) sentence CSVs on the mounted Drive.
TL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataTL.csv'
VL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataVL.csv'

# Load each CSV, discard the 'Pronunciation' column (not used by this notebook),
# remove duplicate rows and renumber the index from 0 so that later cells can
# address rows positionally.
TL_sentence_data = (
    pd.read_csv(TL_sentence_path, encoding='utf-8')
    .drop('Pronunciation', axis=1)
    .drop_duplicates()
    .reset_index(drop=True)
)
VL_sentence_data = (
    pd.read_csv(VL_sentence_path, encoding='utf-8')
    .drop('Pronunciation', axis=1)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Preview the first rows of the training frame.
TL_sentence_data[:5]
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DialectStandard
0여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
1장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까
2예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까
3음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까
4이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"TL_sentence_data[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Dialect\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uac00 \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uae30 \\uc788\\uc2b5\\ub2c8\\uaef4\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub315\\uae30\\uc774\\uaebc\\ub124 \\uc778\\uc790 \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub760\\ub118\\uc73c\\uba74 \\uc548 \\ub374\\ub2e4 \\ucea4\\uc2b5\\ub2c8\\uaef4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Standard\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uc11c \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uac8c \\uc788\\uc2b5\\ub2c8\\uae4c\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub2e4\\ub2c8\\ub2c8\\uae4c \\uc774\\uc81c \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub6f0\\uc5b4\\ub118\\uc73c\\uba74 \\uc548 \\ub41c\\ub2e4 \\ud588\\uc2b5\\ub2c8\\uae4c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n 
]\n}"}},"metadata":{},"execution_count":3}]},{"cell_type":"code","source":["VL_sentence_data[:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"a0cWFdpxDKN7","executionInfo":{"status":"ok","timestamp":1718542436382,"user_tz":-540,"elapsed":2,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"f9a0098f-fb64-4747-dce7-1ec1462da2bb"},"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Dialect \\\n","0 오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요 \n","1 혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다 \n","2 집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼 \n","3 아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이 \n","4 옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로 \n","\n"," Standard \n","0 오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요 \n","1 혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다 \n","2 집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼 \n","3 아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐 \n","4 옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠 "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DialectStandard
0오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요
1혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다
2집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼
3아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐
4옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
standard_sentences_TL = TL_sentence_data['Standard']
dialect_sentences_TL = TL_sentence_data['Dialect']
standard_sentences_VL = VL_sentence_data['Standard']
dialect_sentences_VL = VL_sentence_data['Dialect']

standard_sentences_TL[:5]

dialect_sentences_TL[:5]


def dropIdenticalPairs(dialect_sentences, standard_sentences):
    """Keep only the rows whose dialect and standard sentences differ.

    Args:
        dialect_sentences: pandas Series of dialect sentences.
        standard_sentences: pandas Series of standard sentences with the
            same index as ``dialect_sentences``.

    Returns:
        dict with two equally long lists: ``"src"`` (dialect sentences) and
        ``"tar"`` (the matching standard sentences), in the original row order.
    """
    # Element-wise comparison replaces the former per-row Python loop.
    differs = dialect_sentences.ne(standard_sentences)
    return {
        "src": dialect_sentences[differs].tolist(),
        "tar": standard_sentences[differs].tolist(),
    }


# Training data: drop pairs where the dialect sentence equals the standard one.
filtered_data_TR = dropIdenticalPairs(dialect_sentences_TL, standard_sentences_TL)
filtered_df_TR = pd.DataFrame(filtered_data_TR)

print(filtered_df_TR[:10])
print(len(filtered_df_TR))

# Validation data: same filtering.
filtered_data_VL = dropIdenticalPairs(dialect_sentences_VL, standard_sentences_VL)
filtered_df_VL = pd.DataFrame(filtered_data_VL)

print(filtered_df_VL[:10])
print(len(filtered_df_VL))

import matplotlib
import matplotlib.pyplot as plt


def sentenceLengths(sentences):
    """Return the number of space-separated tokens of every sentence.

    Args:
        sentences: iterable of strings.

    Returns:
        list of int, one entry per sentence, in input order.
    """
    return [len(sentence.split(' ')) for sentence in sentences]


# Token-count distribution of the dialect (source) training sentences.
plt.hist(sentenceLengths(filtered_data_TR['src']), bins=10)
plt.xlabel('length of dialect')
plt.ylabel('number of dialect')
plt.show()
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["plt.hist(sentenceLengths(filtered_data_TR['tar']), bins=10)\n","plt.xlabel('length of standard')\n","plt.ylabel('number of standard')\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":449},"id":"wET-0eUhp2Vv","executionInfo":{"status":"ok","timestamp":1718542451970,"user_tz":-540,"elapsed":1512,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"7e0463f7-8d80-4928-fa81-ce33856369b2"},"execution_count":12,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["def threshold_len_max(max_len, data):\n"," data = list(data) # 제네레이터를 리스트로 변환\n"," sentence_count = 0\n"," for sentence in data:\n"," if len(sentence) <= max_len:\n"," sentence_count += 1\n"," return sentence_count / len(data) * 100\n","\n","def threshold_len_min(min_len, data):\n"," data = list(data) # 제네레이터를 리스트로 변환\n"," sentence_count = 0\n"," for sentence in data:\n"," if len(sentence) >= min_len:\n"," sentence_count += 1\n"," return sentence_count / len(data) * 100"],"metadata":{"id":"SqMQxZO4p1TQ","executionInfo":{"status":"ok","timestamp":1718542451971,"user_tz":-540,"elapsed":3,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":13,"outputs":[]},{"cell_type":"code","source":["len(filtered_data_TR['src'])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cRQ_fdfSE0Rb","executionInfo":{"status":"ok","timestamp":1718542454327,"user_tz":-540,"elapsed":544,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"2f8c73cd-579e-4227-fe94-d9410da00458"},"execution_count":14,"outputs":[{"output_type":"execute_result","data":{"text/plain":["211878"]},"metadata":{},"execution_count":14}]},{"cell_type":"code","source":["max_len = 22\n","dialect_max = threshold_len_max(max_len, (sentence.split(' ') for sentence in filtered_data_TR['src']))\n","standard_max = threshold_len_max(max_len, (sentence.split(' ') for sentence in filtered_data_TR['tar']))\n","\n","print(f\"dialect 중 {max_len} 이하인 비율은 {dialect_max}\")\n","print(f\"standard 중 {max_len} 이하인 비율은 {standard_max}\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ali5lQXaqSf0","executionInfo":{"status":"ok","timestamp":1718542463405,"user_tz":-540,"elapsed":4832,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"a264fcfc-32c1-473c-cee7-e784444f8035"},"execution_count":15,"outputs":[{"output_type":"stream","name":"stdout","text":["dialect 중 22 이하인 
# Many sentences are long, so only pairs whose dialect AND standard side have
# at most max_len tokens are kept (roughly 80% of the data).
d_filter_indices = [
    i for i, sentence in enumerate(filtered_data_TR['src'])
    if len(sentence.split(' ')) <= max_len
]
s_filter_indices = [
    i for i, sentence in enumerate(filtered_data_TR['tar'])
    if len(sentence.split(' ')) <= max_len
]

# Sorted so the surviving row order is deterministic (a bare set has no defined order).
indices = sorted(set(d_filter_indices) & set(s_filter_indices))

len(indices)

import pickle

# Load the length-filtered sentence lists from pickle files on Drive.
# NOTE(review): no cell in this notebook writes these files; presumably they were
# produced from `indices` in an earlier session — confirm they match the current CSVs.
# NOTE(security): pickle.load executes arbitrary code; only load files you created yourself.
with open('/content/drive/MyDrive/LSTM+attention/filtered_dialect.pkl', 'rb') as f:
    filtered_dialect = pickle.load(f)

with open('/content/drive/MyDrive/LSTM+attention/filtered_standard.pkl', 'rb') as f:
    filtered_standard = pickle.load(f)

# The two lists are zipped into (dialect, standard) pairs later; zip would silently
# truncate on a length mismatch, so fail loudly here instead.
assert len(filtered_dialect) == len(filtered_standard), \
    "filtered_dialect and filtered_standard must have the same number of sentences"

# Inspect the loaded data.
print(filtered_dialect[:10])
print(filtered_standard[:10])

print(len(filtered_dialect))
print(len(filtered_standard))


def plotLengthHistogram(sentences, label):
    """Plot a 10-bin histogram of sentence lengths (space-separated token counts).

    Args:
        sentences: iterable of strings.
        label: name of the sentence set, used in both axis labels.
    """
    plt.hist(sentenceLengths(sentences), bins=10)
    plt.xlabel(f'length of {label}')
    plt.ylabel(f'number of {label}')
    plt.show()


plotLengthHistogram(filtered_dialect, 'dialect')

plotLengthHistogram(filtered_standard, 'standard')
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["SOS_token = 0\n","EOS_token = 0\n","\n","class Lang:\n"," def __init__(self, name):\n"," self.name = name\n"," self.word2index = {}\n"," self.word2count = {}\n"," self.index2word = {0: \"SOS\", 1: \"EOS\"}\n"," self.n_words = 2 # SOS, EOS\n","\n"," def addSentence(self, sentence):\n"," for word in sentence.split(\" \"):\n"," self.addWord(word)\n","\n"," def addWord(self, word):\n"," if word not in self.word2index:\n"," self.word2index[word] = self.n_words\n"," self.word2count[word] = 1\n"," self.index2word[self.n_words] = word\n"," self.n_words += 1\n"," else:\n"," self.word2count[word] += 1"],"metadata":{"id":"oMl0xGNU49XX","executionInfo":{"status":"ok","timestamp":1718542471337,"user_tz":-540,"elapsed":2,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":20,"outputs":[]},{"cell_type":"code","source":["# Lang 객체 생성\n","dialect_lang = Lang(\"Dialect\")\n","standard_lang = Lang(\"Standard\")\n","\n","# 문장 추가\n","for sentence in filtered_dialect:\n"," dialect_lang.addSentence(sentence)\n","for sentence in filtered_standard:\n"," standard_lang.addSentence(sentence)\n","for sentence in filtered_df_VL['src']:\n"," dialect_lang.addSentence(sentence)\n","for sentence in filtered_df_VL['tar']:\n"," standard_lang.addSentence(sentence)\n","\n","# 문장\n","pairs = list(zip(filtered_dialect, filtered_standard))\n","VL_pairs = list(zip(filtered_df_VL['src'], filtered_df_VL['tar']))\n","\n","# 문장을 인덱스로 변환\n","def indexesFromSentence(lang, sentence):\n"," return [lang.word2index[word] for word in sentence.split(' ')]\n","\n","def tensorFromSentence(lang, sentence):\n"," indexes = indexesFromSentence(lang, sentence)\n"," indexes.append(EOS_token)\n"," if len(indexes) < max_len:\n"," indexes.extend([EOS_token] * (max_len - len(indexes))) # 패딩 추가\n"," return torch.tensor(indexes[:max_len], dtype=torch.long).view(-1, 1)\n","\n","def tensorsFromPair(pair):\n"," input_tensor = 
tensorFromSentence(dialect_lang, pair[0])\n"," target_tensor = tensorFromSentence(standard_lang, pair[1])\n"," return (input_tensor, target_tensor)"],"metadata":{"id":"VBPYjCbZ8l6k","executionInfo":{"status":"ok","timestamp":1718542475623,"user_tz":-540,"elapsed":2551,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":21,"outputs":[]},{"cell_type":"code","source":["import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","\n","# 검증 데이터를 인덱스로 변환\n","validation_input_tensors = [tensorFromSentence(dialect_lang, pair[0]) for pair in VL_pairs]\n","validation_target_tensors = [tensorFromSentence(standard_lang, pair[1]) for pair in VL_pairs]\n","\n","class EncoderRNN(nn.Module):\n"," def __init__(self, input_size, hidden_size):\n"," super(EncoderRNN, self).__init__()\n"," self.hidden_size = hidden_size\n"," self.embedding = nn.Embedding(input_size, hidden_size)\n"," self.lstm = nn.LSTM(hidden_size, hidden_size)\n","\n"," def forward(self, input, hidden):\n"," embedded = self.embedding(input).view(1, 1, -1)\n"," output, hidden = self.lstm(embedded, hidden)\n"," return output, hidden\n","\n"," def initHidden(self):\n"," return (torch.zeros(1, 1, self.hidden_size),\n"," torch.zeros(1, 1, self.hidden_size))\n","\n","class AttnDecoderRNN(nn.Module):\n"," def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=max_len):\n"," super(AttnDecoderRNN, self).__init__()\n"," self.hidden_size = hidden_size\n"," self.output_size = output_size\n"," self.dropout_p = dropout_p\n"," self.max_length = max_length\n","\n"," self.embedding = nn.Embedding(self.output_size, self.hidden_size)\n"," self.attn = nn.Linear(self.hidden_size * 2, self.max_length)\n"," self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)\n"," self.dropout = nn.Dropout(self.dropout_p)\n"," self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)\n"," self.out = nn.Linear(self.hidden_size, self.output_size)\n","\n"," def forward(self, input, 
hidden, encoder_outputs):\n"," embedded = self.embedding(input).view(1, 1, -1)\n"," embedded = self.dropout(embedded)\n","\n"," attn_weights = nn.functional.softmax(\n"," self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)\n"," attn_applied = torch.bmm(attn_weights.unsqueeze(0),\n"," encoder_outputs.unsqueeze(0))\n","\n"," output = torch.cat((embedded[0], attn_applied[0]), 1)\n"," output = self.attn_combine(output).unsqueeze(0)\n","\n"," output = nn.functional.relu(output)\n"," output, hidden = self.lstm(output, hidden)\n","\n"," output = nn.functional.log_softmax(self.out(output[0]), dim=1)\n"," return output, hidden, attn_weights\n","\n"," def initHidden(self):\n"," return (torch.zeros(1, 1, self.hidden_size),\n"," torch.zeros(1, 1, self.hidden_size))"],"metadata":{"id":"EyqODVGn87BL","executionInfo":{"status":"ok","timestamp":1718542480766,"user_tz":-540,"elapsed":5144,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":22,"outputs":[]},{"cell_type":"code","source":["import random\n","import time\n","import math\n","\n","def asMinutes(s):\n"," m = math.floor(s / 60)\n"," s -= m * 60\n"," return f'{m}m {s:.2f}s'\n","\n","def timeSince(since, percent):\n"," now = time.time()\n"," s = now - since\n"," es = s / (percent)\n"," rs = es - s\n"," return f'{asMinutes(s)} (- {asMinutes(rs)})'\n","\n","def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=max_len):\n"," encoder_hidden = encoder.initHidden()\n","\n"," encoder_optimizer.zero_grad()\n"," decoder_optimizer.zero_grad()\n","\n"," input_length = input_tensor.size(0)\n"," target_length = target_tensor.size(0)\n","\n"," encoder_outputs = torch.zeros(max_length, encoder.hidden_size)\n","\n"," loss = 0\n","\n"," for ei in range(input_length):\n"," encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n"," encoder_outputs[ei] = encoder_output[0, 0]\n","\n"," decoder_input = 
torch.tensor([[SOS_token]])\n","\n"," decoder_hidden = encoder_hidden\n","\n"," for di in range(target_length):\n"," decoder_output, decoder_hidden, decoder_attention = decoder(\n"," decoder_input, decoder_hidden, encoder_outputs)\n"," topv, topi = decoder_output.topk(1)\n"," decoder_input = topi.squeeze().detach() # 다음 입력으로 사용\n","\n"," loss += criterion(decoder_output, target_tensor[di])\n"," if decoder_input.item() == EOS_token:\n"," break\n","\n"," loss.backward()\n","\n"," encoder_optimizer.step()\n"," decoder_optimizer.step()\n","\n"," return loss.item() / target_length\n","\n","def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):\n"," start = time.time()\n"," plot_losses = []\n"," print_loss_total = 0\n"," plot_loss_total = 0\n","\n"," encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)\n"," decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)\n"," training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]\n"," criterion = nn.NLLLoss()\n","\n"," for iter in range(1, n_iters + 1):\n"," training_pair = training_pairs[iter - 1]\n"," input_tensor = training_pair[0]\n"," target_tensor = training_pair[1]\n","\n"," loss = train(input_tensor, target_tensor, encoder,\n"," decoder, encoder_optimizer, decoder_optimizer, criterion)\n"," print_loss_total += loss\n"," plot_loss_total += loss\n","\n"," if iter % print_every == 0:\n"," print_loss_avg = print_loss_total / print_every\n"," print_loss_total = 0\n"," print(f'{timeSince(start, iter / n_iters)} ({iter} {iter / n_iters * 100:.2f}%) {print_loss_avg:.4f}')\n","\n"," if iter % plot_every == 0:\n"," plot_loss_avg = plot_loss_total / plot_every\n"," plot_losses.append(plot_loss_avg)\n"," plot_loss_total = 0\n","\n"," showPlot(plot_losses)\n","\n","def showPlot(points):\n"," plt.figure()\n"," plt.plot(points)\n"," plt.title('Training Loss')\n"," plt.xlabel('Iterations')\n"," plt.ylabel('Loss')\n"," 
plt.show()"],"metadata":{"id":"Uaozw3dc_vdk","executionInfo":{"status":"ok","timestamp":1718542482762,"user_tz":-540,"elapsed":378,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":23,"outputs":[]},{"cell_type":"code","source":["# 모델 초기화 및 훈련\n","hidden_size = 256\n","encoder = EncoderRNN(dialect_lang.n_words, hidden_size)\n","decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)\n","\n","trainIters(encoder, decoder, 1000, print_every=100, plot_every=50) # 적은 수의 iteration으로 실행"],"metadata":{"id":"JLgmcaB5UKtN","colab":{"base_uri":"https://localhost:8080/","height":656},"executionInfo":{"status":"ok","timestamp":1718546219515,"user_tz":-540,"elapsed":3734744,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"bbd58d91-3893-4be7-d31e-f26b407723fb"},"execution_count":24,"outputs":[{"output_type":"stream","name":"stdout","text":["6m 1.12s (- 54m 10.07s) (100 10.00%) 1.4042\n","12m 43.51s (- 50m 54.02s) (200 20.00%) 2.2098\n","19m 8.74s (- 44m 40.40s) (300 30.00%) 1.8428\n","24m 53.66s (- 37m 20.50s) (400 40.00%) 0.8409\n","30m 32.42s (- 30m 32.42s) (500 50.00%) 0.8369\n","36m 22.89s (- 24m 15.26s) (600 60.00%) 0.9774\n","42m 28.65s (- 18m 12.28s) (700 70.00%) 1.2321\n","49m 3.86s (- 12m 15.96s) (800 80.00%) 1.8149\n","55m 38.35s (- 6m 10.93s) (900 90.00%) 1.7333\n","62m 13.26s (- 0m 0.00s) (1000 100.00%) 1.7171\n"]},{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["def saveModel(encoder, decoder, encoder_path='encoder.pth', decoder_path='decoder.pth'): ## 모델 저장\n"," torch.save(encoder.state_dict(), encoder_path)\n"," torch.save(decoder.state_dict(), decoder_path)\n","\n","def loadModel(encoder_path='encoder.pth', decoder_path='decoder.pth'): ## 모델 로드\n"," encoder = EncoderRNN(dialect_lang.n_words, hidden_size)\n"," decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)\n"," encoder.load_state_dict(torch.load(encoder_path))\n"," decoder.load_state_dict(torch.load(decoder_path))\n"," return encoder, decoder"],"metadata":{"id":"_Gjpck4MUHBm","executionInfo":{"status":"ok","timestamp":1718546220678,"user_tz":-540,"elapsed":2,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":25,"outputs":[]},{"cell_type":"code","source":["encoder_path = '/content/drive/MyDrive/LSTM+attention/test_encoder.pth'\n","decoder_path = '/content/drive/MyDrive/LSTM+attention/test_decoder.pth'\n","\n","saveModel(encoder, decoder, encoder_path, decoder_path)"],"metadata":{"id":"TqE3502bKBs5","executionInfo":{"status":"ok","timestamp":1718546227156,"user_tz":-540,"elapsed":6479,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":26,"outputs":[]},{"cell_type":"code","source":["# 테스트 함수\n","\n","def evaluate(encoder, decoder, sentence, max_length=max_len):\n"," with torch.no_grad():\n"," input_tensor = tensorFromSentence(dialect_lang, sentence)\n"," input_length = input_tensor.size()[0]\n"," encoder_hidden = encoder.initHidden()\n","\n"," encoder_outputs = torch.zeros(max_length, encoder.hidden_size)\n","\n"," for ei in range(input_length):\n"," encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n"," encoder_outputs[ei] = encoder_output[0, 0]\n","\n"," decoder_input = torch.tensor([[SOS_token]]) # SOS token\n"," decoder_hidden = encoder_hidden\n","\n"," decoded_words = []\n"," 
decoder_attentions = torch.zeros(max_length, max_length)\n","\n"," for di in range(max_length):\n"," decoder_output, decoder_hidden, decoder_attention = decoder(\n"," decoder_input, decoder_hidden, encoder_outputs)\n"," decoder_attentions[di] = decoder_attention.data\n"," topv, topi = decoder_output.data.topk(1)\n"," if topi.item() == EOS_token:\n"," decoded_words.append('')\n"," break\n"," else:\n"," decoded_words.append(standard_lang.index2word[topi.item()])\n","\n"," decoder_input = topi.squeeze().detach()\n","\n"," return decoded_words\n","\n","def evaluateRandomly(encoder, decoder, n=10):\n"," for i in range(n):\n"," pair = random.choice(test_pairs)\n"," print('Dialect:', pair[0])\n"," print('Expected:', pair[1])\n"," output_words = evaluate(encoder, decoder, pair[0])\n"," output_sentence = ' '.join(output_words)\n"," print('Predicted:', output_sentence)\n"," print('')\n"],"metadata":{"id":"zYySN_5AUvbG"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["## 테스트 데이터 준비 필요\n","test_dialect_sentences = []\n","test_standard_sentences = []\n","\n","test_pairs = list(zip(test_dialect_sentences, test_standard_sentences))"],"metadata":{"id":"ch8xAa69U5DA"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 저장된 모델 불러오기\n","encoder = EncoderRNN(dialect_lang.n_words, hidden_size)\n","decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)\n","\n","loadModel(encoder, decoder, encoder_path, decoder_path)"],"metadata":{"id":"THo_PKRYM4vP"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["## 테스트 함수 실행\n","evaluateRandomly(encoder, decoder, n=len(test_pairs))"],"metadata":{"id":"JQNbhsGTVRCe"},"execution_count":null,"outputs":[]}]} \ No newline at end of file From fdd6904fe9bfcb4571bfd7a46c3922c77d9c8855 Mon Sep 17 00:00:00 2001 From: edcrfv458 Date: Mon, 17 Jun 2024 05:08:28 +0900 Subject: [PATCH 2/3] =?UTF-8?q?Del:=20=ED=85=8C=EC=8A=A4=ED=8A=B8=20?= 
=?UTF-8?q?=ED=8C=8C=EC=9D=BC=20=EC=82=AD=EC=A0=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AI/LSTM_attention_test.ipynb | 1 - 1 file changed, 1 deletion(-) delete mode 100644 AI/LSTM_attention_test.ipynb diff --git a/AI/LSTM_attention_test.ipynb b/AI/LSTM_attention_test.ipynb deleted file mode 100644 index a68fab1..0000000 --- a/AI/LSTM_attention_test.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyNByVSUfJ94Bpl8oocV6J3f"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ix1Jbg_xWTxg","executionInfo":{"status":"ok","timestamp":1718542427282,"user_tz":-540,"elapsed":19043,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"121c7e9e-b3b8-4f2b-f10c-dd800d4676c7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["import pandas as pd\n","import numpy as np\n","import os\n","import json\n","import csv\n","\n","TL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataTL.csv'\n","VL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataVL.csv'\n","\n","# data파일 불러오기\n","TL_sentence_data = pd.read_csv(TL_sentence_path, encoding='utf-8')\n","VL_sentence_data = pd.read_csv(VL_sentence_path, encoding='utf-8')\n","\n","# 중복 제거, Pronuncication 열은 필요 없다고 생각\n","TL_sentence_data.drop('Pronunciation', axis=1, inplace=True)\n","TL_sentence_data = TL_sentence_data.drop_duplicates().reset_index(drop=True)\n","VL_sentence_data.drop('Pronunciation', axis=1, inplace=True)\n","VL_sentence_data = 
VL_sentence_data.drop_duplicates().reset_index(drop=True)"],"metadata":{"id":"xPCQBU1BWfcw","executionInfo":{"status":"ok","timestamp":1718542436382,"user_tz":-540,"elapsed":7833,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["TL_sentence_data[:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"UZGjs0aPXrCe","executionInfo":{"status":"ok","timestamp":1718542436382,"user_tz":-540,"elapsed":3,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"928eece9-4d05-4c04-90db-287d3dd7f8b9"},"execution_count":3,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Dialect \\\n","0 여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴 \n","1 장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴 \n","2 예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴 \n","3 음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까 \n","4 이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네 \n","\n"," Standard \n","0 여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까 \n","1 장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까 \n","2 예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까 \n","3 음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까 \n","4 이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네 "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DialectStandard
0여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
1장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까
2예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까
3음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까
4이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"TL_sentence_data[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Dialect\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uac00 \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uae30 \\uc788\\uc2b5\\ub2c8\\uaef4\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub315\\uae30\\uc774\\uaebc\\ub124 \\uc778\\uc790 \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub760\\ub118\\uc73c\\uba74 \\uc548 \\ub374\\ub2e4 \\ucea4\\uc2b5\\ub2c8\\uaef4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Standard\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uc11c \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uac8c \\uc788\\uc2b5\\ub2c8\\uae4c\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub2e4\\ub2c8\\ub2c8\\uae4c \\uc774\\uc81c \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub6f0\\uc5b4\\ub118\\uc73c\\uba74 \\uc548 \\ub41c\\ub2e4 \\ud588\\uc2b5\\ub2c8\\uae4c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n 
]\n}"}},"metadata":{},"execution_count":3}]},{"cell_type":"code","source":["VL_sentence_data[:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"a0cWFdpxDKN7","executionInfo":{"status":"ok","timestamp":1718542436382,"user_tz":-540,"elapsed":2,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"f9a0098f-fb64-4747-dce7-1ec1462da2bb"},"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" Dialect \\\n","0 오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요 \n","1 혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다 \n","2 집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼 \n","3 아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이 \n","4 옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로 \n","\n"," Standard \n","0 오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요 \n","1 혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다 \n","2 집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼 \n","3 아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐 \n","4 옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠 "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DialectStandard
0오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요
1혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다
2집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼
3아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐
4옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"VL_sentence_data[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Dialect\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\ud608\\uc555\\uc57d\\uc740 \\uc2dc\\uac04\\uc744 \\ub9de\\ucdb0 \\ucc59\\uaca8 \\ub4dc\\uc154\\uc57c\\uc9c0 \\uc548 \\uadf8\\ub7ec\\uba74 \\ud6a8\\uacfc\\uac00 \\uc5c6\\uc2b5\\ub2c8\\ub2e4\",\n \"\\uc61b\\ub0a0\\ubd80\\ud130 \\uc870\\uc0c1\\uafc8\\uc774\\ub098 \\ub3fc\\uc9c0\\uafc8 \\uafb8\\ub9cc \\uc9d1\\uc5d0 \\ub3c8 \\ub9ce\\uc774 \\ub4e4\\uc5b4\\uc628\\ub2e4\\uace0 \\uc88b\\uc544 \\ud574\\uc9c0\\ub85c\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\uc774\\uaebc\\ub124 \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\ubf08\\ub2e4\\uc9c0\\uac00 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Standard\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\ud608\\uc555\\uc57d\\uc740 \\uc2dc\\uac04\\uc744 \\ub9de\\ucdb0 \\ucc59\\uaca8 \\ub4dc\\uc154\\uc57c\\uc9c0 \\uc548 \\uadf8\\ub7ec\\uba74 \\ud6a8\\uacfc\\uac00 \\uc5c6\\uc2b5\\ub2c8\\ub2e4\",\n \"\\uc61b\\ub0a0\\ubd80\\ud130 \\uc870\\uc0c1\\uafc8\\uc774\\ub098 \\ub3fc\\uc9c0\\uafc8 \\uafb8\\uba74 \\uc9d1\\uc5d0 \\ub3c8 \\ub9ce\\uc774 \\ub4e4\\uc5b4\\uc628\\ub2e4\\uace0 \\uc88b\\uc544 \\ud588\\uc8e0\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\ub2c8\\uae4c \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\uc11c\\ub78d\\uc774 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":4}]},{"cell_type":"code","source":["standard_sentences_TL = TL_sentence_data['Standard']\n","dialect_sentences_TL = 
TL_sentence_data['Dialect']\n","standard_sentences_VL = VL_sentence_data['Standard']\n","dialect_sentences_VL = VL_sentence_data['Dialect']"],"metadata":{"id":"jlxCy4d3WyDB","executionInfo":{"status":"ok","timestamp":1718542437775,"user_tz":-540,"elapsed":1,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":5,"outputs":[]},{"cell_type":"code","source":["standard_sentences_TL[:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"aj-awCcGFReV","executionInfo":{"status":"ok","timestamp":1718542439773,"user_tz":-540,"elapsed":1,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"218f7804-9aff-4647-8c07-9632c99fc102"},"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0 여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까\n","1 장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까\n","2 예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까\n","3 음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까\n","4 이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네\n","Name: Standard, dtype: object"]},"metadata":{},"execution_count":6}]},{"cell_type":"code","source":["dialect_sentences_TL[:5]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ctrb-c6hFWAX","executionInfo":{"status":"ok","timestamp":1718542440184,"user_tz":-540,"elapsed":1,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"1a040177-63e5-46dd-98d6-b0d2db7237fa"},"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0 여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴\n","1 장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴\n","2 예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴\n","3 음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까\n","4 이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네\n","Name: Dialect, dtype: object"]},"metadata":{},"execution_count":7}]},{"cell_type":"code","source":["# 학습 데이터 중에서 겹치는 표준어 문장과 방언 문장 제거\n","filtered_data_TR = {\n"," \"src\": [],\n"," \"tar\": []\n","}\n","\n","for i in range(0, len(dialect_sentences_TL)):\n"," if 
(standard_sentences_TL[i] != dialect_sentences_TL[i]):\n"," filtered_data_TR[\"src\"].append(dialect_sentences_TL[i])\n"," filtered_data_TR[\"tar\"].append(standard_sentences_TL[i])\n","\n","filtered_df_TR = pd.DataFrame(filtered_data_TR)\n","\n","print(filtered_df_TR[:10])\n","print(len(filtered_df_TR))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GEIz3cMTXc76","executionInfo":{"status":"ok","timestamp":1718542446367,"user_tz":-540,"elapsed":4892,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"c483b361-5c75-445a-92b9-81cc227cf478"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":[" src \\\n","0 여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴 \n","1 장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴 \n","2 예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴 \n","3 이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네 \n","4 콩이파리는 가시가 있어가 꺼끄럽고 뻣뻣하고 묵어 보면 맛이 없어예 \n","5 여기에는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴 \n","6 여개는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴 \n","7 음식 먹으만 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니데이 \n","8 논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 무짜로 만지만 위험합니더 \n","9 딱꾹지를 멈치지도 않고 점들 하는디 이럴 때는 우예 해야 합니껴 \n","\n"," tar \n","0 여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까 \n","1 장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까 \n","2 예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까 \n","3 이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네 \n","4 콩잎은 가시가 있어서 껄끄럽고 뻣뻣하고 먹어 보면 맛이 없어요 \n","5 여기에는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까 \n","6 여기는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까 \n","7 음식 먹으면 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니다 \n","8 논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 함부로 만지면 위험합니다 \n","9 딱꾹지를 멈추지도 않고 점들 하는데 이럴 때는 어떻게 해야 합니까 \n","211878\n"]}]},{"cell_type":"code","source":["# 검증 데이터 중에서 겹치는 표준어 문장과 방언 문장 제거\n","filtered_data_VL = {\n"," \"src\": [],\n"," \"tar\": []\n","}\n","\n","for i in range(0, len(dialect_sentences_VL)):\n"," if (standard_sentences_VL[i] != dialect_sentences_VL[i]):\n"," filtered_data_VL[\"src\"].append(dialect_sentences_VL[i])\n"," 
filtered_data_VL[\"tar\"].append(standard_sentences_VL[i])\n","\n","filtered_df_VL = pd.DataFrame(filtered_data_VL)\n","\n","print(filtered_df_VL[:10])\n","print(len(filtered_df_VL))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"kv006ObsFwYF","executionInfo":{"status":"ok","timestamp":1718542446916,"user_tz":-540,"elapsed":551,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"73a1da7d-97fb-4073-83a8-93a72ef27755"},"execution_count":9,"outputs":[{"output_type":"stream","name":"stdout","text":[" src \\\n","0 오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요 \n","1 집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼 \n","2 아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이 \n","3 옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로 \n","4 게얼에 먹을 채소나 과일 같은 것은 어데 보관을 했습니꺼 \n","5 촌구숙이라 젊은 사람들은 함부레 없고 전부 노인들만 있으이꺼네 농사 짓기가 힘들어요 \n","6 촌구석이라 젊은 사람들은 한 번이 없고 전부 노인들만 있으니까네 농사 짓기가 힘들어요 \n","7 소도 사람맨치로 잘 먹어야 근육도 붙고 심도 생겨서 일을 잘 하지로 \n","8 소도 사람 맨치로 잘 먹어야 근육도 붇고 힘도 생겨서 일을 잘 하지요 \n","9 옷가심을 짜를 때는 미리 선을 끟어 놓아야 쪽바리 잘 자를 수 있어예 \n","\n"," tar \n","0 오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요 \n","1 집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼 \n","2 아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐 \n","3 옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠 \n","4 겨울에 먹을 채소나 과일 같은 것은 어디에 보관을 했습니까 \n","5 촌구석이라 젊은 사람들은 아예 없고 전부 노인들만 있으니까 농사 짓기가 힘들어요 \n","6 촌구석이라 젊은 사람들은 한 번이 없고 전부 노인들만 있으니까 농사 짓기가 힘들어요 \n","7 소도 사람처럼 잘 먹어야 근육도 붙고 힘도 생겨서 일을 잘 하지요 \n","8 소도 사람 처럼 잘 먹어야 근육도 붇고 힘도 생겨서 일을 잘 하지요 \n","9 옷감을 자를 때는 미리 선을 그어 놓아야 똑바로 잘 자를 수 있어요 \n","27509\n"]}]},{"cell_type":"code","source":["import matplotlib\n","import matplotlib.pyplot as plt\n","\n","# 문장 길이 계산\n","def sentenceLengths(sentences):\n"," return [len(sentence.split(' ')) for sentence in 
sentences]"],"metadata":{"id":"OFCJmuqdOo6m","executionInfo":{"status":"ok","timestamp":1718542449093,"user_tz":-540,"elapsed":379,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":10,"outputs":[]},{"cell_type":"code","source":["plt.hist(sentenceLengths(filtered_data_TR['src']), bins=10)\n","plt.xlabel('length of dialect')\n","plt.ylabel('number of dialect')\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":449},"id":"-zqR5FSPpN3X","executionInfo":{"status":"ok","timestamp":1718542450459,"user_tz":-540,"elapsed":965,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"20ab6680-060a-4d1d-d345-37d54af28581"},"execution_count":11,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["plt.hist(sentenceLengths(filtered_data_TR['tar']), bins=10)\n","plt.xlabel('length of standard')\n","plt.ylabel('number of standard')\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":449},"id":"wET-0eUhp2Vv","executionInfo":{"status":"ok","timestamp":1718542451970,"user_tz":-540,"elapsed":1512,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"7e0463f7-8d80-4928-fa81-ce33856369b2"},"execution_count":12,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["def threshold_len_max(max_len, data):\n"," data = list(data) # 제네레이터를 리스트로 변환\n"," sentence_count = 0\n"," for sentence in data:\n"," if len(sentence) <= max_len:\n"," sentence_count += 1\n"," return sentence_count / len(data) * 100\n","\n","def threshold_len_min(min_len, data):\n"," data = list(data) # 제네레이터를 리스트로 변환\n"," sentence_count = 0\n"," for sentence in data:\n"," if len(sentence) >= min_len:\n"," sentence_count += 1\n"," return sentence_count / len(data) * 100"],"metadata":{"id":"SqMQxZO4p1TQ","executionInfo":{"status":"ok","timestamp":1718542451971,"user_tz":-540,"elapsed":3,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":13,"outputs":[]},{"cell_type":"code","source":["len(filtered_data_TR['src'])"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cRQ_fdfSE0Rb","executionInfo":{"status":"ok","timestamp":1718542454327,"user_tz":-540,"elapsed":544,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"2f8c73cd-579e-4227-fe94-d9410da00458"},"execution_count":14,"outputs":[{"output_type":"execute_result","data":{"text/plain":["211878"]},"metadata":{},"execution_count":14}]},{"cell_type":"code","source":["max_len = 22\n","dialect_max = threshold_len_max(max_len, (sentence.split(' ') for sentence in filtered_data_TR['src']))\n","standard_max = threshold_len_max(max_len, (sentence.split(' ') for sentence in filtered_data_TR['tar']))\n","\n","print(f\"dialect 중 {max_len} 이하인 비율은 {dialect_max}\")\n","print(f\"standard 중 {max_len} 이하인 비율은 {standard_max}\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Ali5lQXaqSf0","executionInfo":{"status":"ok","timestamp":1718542463405,"user_tz":-540,"elapsed":4832,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"a264fcfc-32c1-473c-cee7-e784444f8035"},"execution_count":15,"outputs":[{"output_type":"stream","name":"stdout","text":["dialect 중 22 이하인 
비율은 80.23060440442141\n","standard 중 22 이하인 비율은 80.11355591425254\n"]}]},{"cell_type":"code","source":["## 문장의 길이가 긴 것이 많아 80프로 정도의 데이터만 남김\n","\n","d_filter_indices = [i for i, sentence in enumerate(sentence.split(' ') for sentence in filtered_data_TR['src']) if len(sentence) <= max_len ]\n","s_filter_indices = [i for i, sentence in enumerate(sentence.split(' ') for sentence in filtered_data_TR['tar']) if len(sentence) <= max_len ]"],"metadata":{"id":"iLXOEUz2u45D","executionInfo":{"status":"ok","timestamp":1718542028255,"user_tz":-540,"elapsed":521,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":76,"outputs":[]},{"cell_type":"code","source":["indices = list(set(d_filter_indices) & set(s_filter_indices))"],"metadata":{"id":"aV630gtgwMDM","executionInfo":{"status":"ok","timestamp":1718542028256,"user_tz":-540,"elapsed":3,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":77,"outputs":[]},{"cell_type":"code","source":["len(indices)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"U2I4XBAtPd_b","executionInfo":{"status":"ok","timestamp":1718542028256,"user_tz":-540,"elapsed":3,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"6766e721-a539-4d7e-b728-5484beabcff9"},"execution_count":78,"outputs":[{"output_type":"execute_result","data":{"text/plain":["169723"]},"metadata":{},"execution_count":78}]},{"cell_type":"code","source":["import pickle\n","1\n","# pickle 파일로부터 데이터를 불러옴\n","with open('/content/drive/MyDrive/LSTM+attention/filtered_dialect.pkl', 'rb') as f:\n"," filtered_dialect = pickle.load(f)\n","\n","with open('/content/drive/MyDrive/LSTM+attention/filtered_standard.pkl', 'rb') as f:\n"," filtered_standard = pickle.load(f)\n","\n","# 불러온 데이터를 
확인\n","print(filtered_dialect[:10])\n","print(filtered_standard[:10])"],"metadata":{"id":"B041nyJnISFJ","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1718542466153,"user_tz":-540,"elapsed":1852,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"bafb6241-5772-4872-b2c5-2fcb9e58e4d9"},"execution_count":16,"outputs":[{"output_type":"stream","name":"stdout","text":["['여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴', '장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴', '예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴', '이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네', '콩이파리는 가시가 있어가 꺼끄럽고 뻣뻣하고 묵어 보면 맛이 없어예', '여기에는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴', '여개는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴', '음식 먹으만 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니데이', '논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 무짜로 만지만 위험합니더', '딱꾹지를 멈치지도 않고 점들 하는디 이럴 때는 우예 해야 합니껴']\n","['여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까', '장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까', '예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까', '이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네', '콩잎은 가시가 있어서 껄끄럽고 뻣뻣하고 먹어 보면 맛이 없어요', '여기에는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까', '여기는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까', '음식 먹으면 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니다', '논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 함부로 만지면 위험합니다', '딱꾹지를 멈추지도 않고 점들 하는데 이럴 때는 어떻게 해야 합니까']\n"]}]},{"cell_type":"code","source":["print(len(filtered_dialect))\n","print(len(filtered_standard))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_dUEMZ8HRPow","executionInfo":{"status":"ok","timestamp":1718542468332,"user_tz":-540,"elapsed":380,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"9478287a-88f0-4636-9c96-59d0e51d4aff"},"execution_count":17,"outputs":[{"output_type":"stream","name":"stdout","text":["169723\n","169723\n"]}]},{"cell_type":"code","source":["plt.hist(sentenceLengths(filtered_dialect), bins=10)\n","plt.xlabel('length of dialect')\n","plt.ylabel('number of 
dialect')\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":449},"id":"yf8viS-nR3bN","executionInfo":{"status":"ok","timestamp":1718542470919,"user_tz":-540,"elapsed":928,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"fa38f81d-da41-42c6-d94f-012ca06d2ecb"},"execution_count":18,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["plt.hist(sentenceLengths(filtered_standard), bins=10)\n","plt.xlabel('length of standard')\n","plt.ylabel('number of standard')\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":449},"id":"2g430fC7RliO","executionInfo":{"status":"ok","timestamp":1718542471337,"user_tz":-540,"elapsed":419,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"64597664-0dde-47eb-873a-796ac99f130e"},"execution_count":19,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["SOS_token = 0\n","EOS_token = 0\n","\n","class Lang:\n"," def __init__(self, name):\n"," self.name = name\n"," self.word2index = {}\n"," self.word2count = {}\n"," self.index2word = {0: \"SOS\", 1: \"EOS\"}\n"," self.n_words = 2 # SOS, EOS\n","\n"," def addSentence(self, sentence):\n"," for word in sentence.split(\" \"):\n"," self.addWord(word)\n","\n"," def addWord(self, word):\n"," if word not in self.word2index:\n"," self.word2index[word] = self.n_words\n"," self.word2count[word] = 1\n"," self.index2word[self.n_words] = word\n"," self.n_words += 1\n"," else:\n"," self.word2count[word] += 1"],"metadata":{"id":"oMl0xGNU49XX","executionInfo":{"status":"ok","timestamp":1718542471337,"user_tz":-540,"elapsed":2,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":20,"outputs":[]},{"cell_type":"code","source":["# Lang 객체 생성\n","dialect_lang = Lang(\"Dialect\")\n","standard_lang = Lang(\"Standard\")\n","\n","# 문장 추가\n","for sentence in filtered_dialect:\n"," dialect_lang.addSentence(sentence)\n","for sentence in filtered_standard:\n"," standard_lang.addSentence(sentence)\n","for sentence in filtered_df_VL['src']:\n"," dialect_lang.addSentence(sentence)\n","for sentence in filtered_df_VL['tar']:\n"," standard_lang.addSentence(sentence)\n","\n","# 문장\n","pairs = list(zip(filtered_dialect, filtered_standard))\n","VL_pairs = list(zip(filtered_df_VL['src'], filtered_df_VL['tar']))\n","\n","# 문장을 인덱스로 변환\n","def indexesFromSentence(lang, sentence):\n"," return [lang.word2index[word] for word in sentence.split(' ')]\n","\n","def tensorFromSentence(lang, sentence):\n"," indexes = indexesFromSentence(lang, sentence)\n"," indexes.append(EOS_token)\n"," if len(indexes) < max_len:\n"," indexes.extend([EOS_token] * (max_len - len(indexes))) # 패딩 추가\n"," return torch.tensor(indexes[:max_len], dtype=torch.long).view(-1, 1)\n","\n","def tensorsFromPair(pair):\n"," input_tensor = 
tensorFromSentence(dialect_lang, pair[0])\n"," target_tensor = tensorFromSentence(standard_lang, pair[1])\n"," return (input_tensor, target_tensor)"],"metadata":{"id":"VBPYjCbZ8l6k","executionInfo":{"status":"ok","timestamp":1718542475623,"user_tz":-540,"elapsed":2551,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":21,"outputs":[]},{"cell_type":"code","source":["import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","\n","# 검증 데이터를 인덱스로 변환\n","validation_input_tensors = [tensorFromSentence(dialect_lang, pair[0]) for pair in VL_pairs]\n","validation_target_tensors = [tensorFromSentence(standard_lang, pair[1]) for pair in VL_pairs]\n","\n","class EncoderRNN(nn.Module):\n"," def __init__(self, input_size, hidden_size):\n"," super(EncoderRNN, self).__init__()\n"," self.hidden_size = hidden_size\n"," self.embedding = nn.Embedding(input_size, hidden_size)\n"," self.lstm = nn.LSTM(hidden_size, hidden_size)\n","\n"," def forward(self, input, hidden):\n"," embedded = self.embedding(input).view(1, 1, -1)\n"," output, hidden = self.lstm(embedded, hidden)\n"," return output, hidden\n","\n"," def initHidden(self):\n"," return (torch.zeros(1, 1, self.hidden_size),\n"," torch.zeros(1, 1, self.hidden_size))\n","\n","class AttnDecoderRNN(nn.Module):\n"," def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=max_len):\n"," super(AttnDecoderRNN, self).__init__()\n"," self.hidden_size = hidden_size\n"," self.output_size = output_size\n"," self.dropout_p = dropout_p\n"," self.max_length = max_length\n","\n"," self.embedding = nn.Embedding(self.output_size, self.hidden_size)\n"," self.attn = nn.Linear(self.hidden_size * 2, self.max_length)\n"," self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)\n"," self.dropout = nn.Dropout(self.dropout_p)\n"," self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)\n"," self.out = nn.Linear(self.hidden_size, self.output_size)\n","\n"," def forward(self, input, 
hidden, encoder_outputs):\n"," embedded = self.embedding(input).view(1, 1, -1)\n"," embedded = self.dropout(embedded)\n","\n"," attn_weights = nn.functional.softmax(\n"," self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)\n"," attn_applied = torch.bmm(attn_weights.unsqueeze(0),\n"," encoder_outputs.unsqueeze(0))\n","\n"," output = torch.cat((embedded[0], attn_applied[0]), 1)\n"," output = self.attn_combine(output).unsqueeze(0)\n","\n"," output = nn.functional.relu(output)\n"," output, hidden = self.lstm(output, hidden)\n","\n"," output = nn.functional.log_softmax(self.out(output[0]), dim=1)\n"," return output, hidden, attn_weights\n","\n"," def initHidden(self):\n"," return (torch.zeros(1, 1, self.hidden_size),\n"," torch.zeros(1, 1, self.hidden_size))"],"metadata":{"id":"EyqODVGn87BL","executionInfo":{"status":"ok","timestamp":1718542480766,"user_tz":-540,"elapsed":5144,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":22,"outputs":[]},{"cell_type":"code","source":["import random\n","import time\n","import math\n","\n","def asMinutes(s):\n"," m = math.floor(s / 60)\n"," s -= m * 60\n"," return f'{m}m {s:.2f}s'\n","\n","def timeSince(since, percent):\n"," now = time.time()\n"," s = now - since\n"," es = s / (percent)\n"," rs = es - s\n"," return f'{asMinutes(s)} (- {asMinutes(rs)})'\n","\n","def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=max_len):\n"," encoder_hidden = encoder.initHidden()\n","\n"," encoder_optimizer.zero_grad()\n"," decoder_optimizer.zero_grad()\n","\n"," input_length = input_tensor.size(0)\n"," target_length = target_tensor.size(0)\n","\n"," encoder_outputs = torch.zeros(max_length, encoder.hidden_size)\n","\n"," loss = 0\n","\n"," for ei in range(input_length):\n"," encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n"," encoder_outputs[ei] = encoder_output[0, 0]\n","\n"," decoder_input = 
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Run ``n_iters`` single-pair training steps and plot the averaged loss.

    Args:
        encoder: Encoder module handed to ``train``.
        decoder: Decoder module handed to ``train``.
        n_iters: Number of training steps; one pair is drawn from the
            module-level ``pairs`` (with replacement) for each step.
        print_every: Number of steps between progress lines.
        plot_every: Number of steps averaged into one plotted point.
        learning_rate: Learning rate of both SGD optimizers.
    """
    started_at = time.time()
    averaged_losses = []
    print_accum = 0
    plot_accum = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All examples are sampled before the first step is taken.
    sampled = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(sampled, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_accum += loss
        plot_accum += loss

        if step % print_every == 0:
            print_loss_avg = print_accum / print_every
            print_accum = 0
            print(f'{timeSince(started_at, step / n_iters)} ({step} {step / n_iters * 100:.2f}%) {print_loss_avg:.4f}')

        if step % plot_every == 0:
            averaged_losses.append(plot_accum / plot_every)
            plot_accum = 0

    # A single figure is drawn once training has finished.
    showPlot(averaged_losses)
# Initialize the model and train it.
# hidden_size is kept as a module-level name because loadModel reads it when rebuilding the networks.
hidden_size = 256
encoder = EncoderRNN(dialect_lang.n_words, hidden_size)
decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)

trainIters(encoder, decoder, 1000, print_every=100, plot_every=50) # run with a small number of iterations
def saveModel(encoder, decoder, encoder_path='encoder.pth', decoder_path='decoder.pth'):
    """Save the encoder and decoder ``state_dict``s to the given paths."""
    torch.save(encoder.state_dict(), encoder_path)
    torch.save(decoder.state_dict(), decoder_path)


def loadModel(encoder_path='encoder.pth', decoder_path='decoder.pth'):
    """Rebuild the encoder/decoder and restore the weights saved by ``saveModel``.

    The networks are constructed from the module-level ``dialect_lang``,
    ``standard_lang`` and ``hidden_size``, so those must match the values
    used when the checkpoint was written.

    Returns:
        ``(encoder, decoder)`` with the stored weights loaded.
    """
    encoder = EncoderRNN(dialect_lang.n_words, hidden_size)
    decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)
    # map_location='cpu' lets a checkpoint that holds CUDA tensors be read on a
    # runtime without a GPU; load_state_dict then copies the values into the
    # freshly built modules' own parameters.
    encoder.load_state_dict(torch.load(encoder_path, map_location='cpu'))
    decoder.load_state_dict(torch.load(decoder_path, map_location='cpu'))
    return encoder, decoder
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly drawn test pairs next to the model's prediction.

    Pairs are taken from the module-level ``test_pairs`` with replacement,
    so the same pair may be shown more than once.
    """
    for _ in range(n):
        pair = random.choice(test_pairs)
        print('Dialect:', pair[0])
        print('Expected:', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('Predicted:', output_sentence)
        print('')


## Test data still has to be prepared
test_dialect_sentences = []
test_standard_sentences = []

test_pairs = list(zip(test_dialect_sentences, test_standard_sentences))

# Load the saved model.
# loadModel takes only the two checkpoint paths and returns freshly built
# networks, so its result must be bound here (the previous four-argument
# call raised TypeError and discarded the return value).
encoder, decoder = loadModel(encoder_path, decoder_path)
# Inference only: eval() disables the decoder's dropout, which
# torch.no_grad() alone does not do.
encoder.eval()
decoder.eval()

## Run the test function
evaluateRandomly(encoder, decoder, n=len(test_pairs))
=?UTF-8?q?=ED=8C=8C=EC=9D=BC=20=EC=83=9D=EC=84=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit validation data에 대한 Loss 출려과 사전에 없는 단어가 들어오는 경우 처리 부분 추가 --- AI/LSTM_attention_test.ipynb | 1681 ++++++++++++++++++++++++++++++++++ 1 file changed, 1681 insertions(+) create mode 100644 AI/LSTM_attention_test.ipynb diff --git a/AI/LSTM_attention_test.ipynb b/AI/LSTM_attention_test.ipynb new file mode 100644 index 0000000..47e6cd6 --- /dev/null +++ b/AI/LSTM_attention_test.ipynb @@ -0,0 +1,1681 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ix1Jbg_xWTxg", + "outputId": "2b5f0a6a-2634-47a4-e269-b72e64d74705" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import json\n", + "import csv\n", + "\n", + "TL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataTL.csv'\n", + "VL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataVL.csv'\n", + "\n", + "# data파일 불러오기\n", + "TL_sentence_data = pd.read_csv(TL_sentence_path, encoding='utf-8')\n", + "VL_sentence_data = pd.read_csv(VL_sentence_path, encoding='utf-8')\n", + "\n", + "# 중복 제거, Pronuncication 열은 필요 없다고 생각\n", + "TL_sentence_data.drop('Pronunciation', axis=1, inplace=True)\n", + "TL_sentence_data = 
TL_sentence_data.drop_duplicates().reset_index(drop=True)\n", + "VL_sentence_data.drop('Pronunciation', axis=1, inplace=True)\n", + "VL_sentence_data = VL_sentence_data.drop_duplicates().reset_index(drop=True)" + ], + "metadata": { + "id": "xPCQBU1BWfcw" + }, + "execution_count": 40, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "TL_sentence_data[:5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "UZGjs0aPXrCe", + "outputId": "b61a98c7-d756-401d-dec3-e4cbea4a68c4" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Dialect \\\n", + "0 여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴 \n", + "1 장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴 \n", + "2 예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴 \n", + "3 음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까 \n", + "4 이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네 \n", + "\n", + " Standard \n", + "0 여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까 \n", + "1 장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까 \n", + "2 예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까 \n", + "3 음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까 \n", + "4 이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DialectStandard
0여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
1장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까
2예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까
3음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까
4이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"TL_sentence_data[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Dialect\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uac00 \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uae30 \\uc788\\uc2b5\\ub2c8\\uaef4\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub315\\uae30\\uc774\\uaebc\\ub124 \\uc778\\uc790 \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub760\\ub118\\uc73c\\uba74 \\uc548 \\ub374\\ub2e4 \\ucea4\\uc2b5\\ub2c8\\uaef4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Standard\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uc11c \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uac8c \\uc788\\uc2b5\\ub2c8\\uae4c\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub2e4\\ub2c8\\ub2c8\\uae4c \\uc774\\uc81c \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub6f0\\uc5b4\\ub118\\uc73c\\uba74 \\uc548 \\ub41c\\ub2e4 \\ud588\\uc2b5\\ub2c8\\uae4c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 41 + } + ] + }, + { + "cell_type": "code", + "source": [ + "VL_sentence_data[:5]" + ], + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "a0cWFdpxDKN7", + "outputId": "d47ecea6-b4fc-4d39-fd3c-d1d7369da953" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Dialect \\\n", + "0 오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요 \n", + "1 혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다 \n", + "2 집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼 \n", + "3 아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이 \n", + "4 옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로 \n", + "\n", + " Standard \n", + "0 오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요 \n", + "1 혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다 \n", + "2 집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼 \n", + "3 아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐 \n", + "4 옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DialectStandard
0오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요
1혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다
2집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼
3아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐
4옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"VL_sentence_data[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Dialect\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\ud608\\uc555\\uc57d\\uc740 \\uc2dc\\uac04\\uc744 \\ub9de\\ucdb0 \\ucc59\\uaca8 \\ub4dc\\uc154\\uc57c\\uc9c0 \\uc548 \\uadf8\\ub7ec\\uba74 \\ud6a8\\uacfc\\uac00 \\uc5c6\\uc2b5\\ub2c8\\ub2e4\",\n \"\\uc61b\\ub0a0\\ubd80\\ud130 \\uc870\\uc0c1\\uafc8\\uc774\\ub098 \\ub3fc\\uc9c0\\uafc8 \\uafb8\\ub9cc \\uc9d1\\uc5d0 \\ub3c8 \\ub9ce\\uc774 \\ub4e4\\uc5b4\\uc628\\ub2e4\\uace0 \\uc88b\\uc544 \\ud574\\uc9c0\\ub85c\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\uc774\\uaebc\\ub124 \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\ubf08\\ub2e4\\uc9c0\\uac00 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Standard\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\ud608\\uc555\\uc57d\\uc740 \\uc2dc\\uac04\\uc744 \\ub9de\\ucdb0 \\ucc59\\uaca8 \\ub4dc\\uc154\\uc57c\\uc9c0 \\uc548 \\uadf8\\ub7ec\\uba74 \\ud6a8\\uacfc\\uac00 \\uc5c6\\uc2b5\\ub2c8\\ub2e4\",\n \"\\uc61b\\ub0a0\\ubd80\\ud130 \\uc870\\uc0c1\\uafc8\\uc774\\ub098 \\ub3fc\\uc9c0\\uafc8 \\uafb8\\uba74 \\uc9d1\\uc5d0 \\ub3c8 \\ub9ce\\uc774 \\ub4e4\\uc5b4\\uc628\\ub2e4\\uace0 \\uc88b\\uc544 \\ud588\\uc8e0\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\ub2c8\\uae4c \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\uc11c\\ub78d\\uc774 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "code", + "source": [ + "standard_sentences_TL = 
# Training data: drop pairs whose dialect and standard sentences are identical.
# One vectorized comparison replaces the row-by-row Python loop; both Series
# come from the same frame, so they share one index and compare element-wise.
differs_TL = dialect_sentences_TL != standard_sentences_TL
filtered_data_TR = {
    "src": dialect_sentences_TL[differs_TL].tolist(),
    "tar": standard_sentences_TL[differs_TL].tolist()
}

filtered_df_TR = pd.DataFrame(filtered_data_TR)

print(filtered_df_TR[:10])
print(len(filtered_df_TR))
def sentenceLengths(sentences):
    """Return, for every sentence, how many pieces a split on one space yields.

    Splitting uses ``' '`` explicitly, so consecutive, leading or trailing
    spaces produce empty pieces that are counted as well.
    """
    lengths = []
    for text in sentences:
        pieces = text.split(' ')
        lengths.append(len(pieces))
    return lengths
dialect')\n", + "plt.ylabel('number of dialect')\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 449 + }, + "id": "-zqR5FSPpN3X", + "outputId": "8c626b82-8e48-4504-f14a-98261e591f3f" + }, + "execution_count": 49, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "plt.hist(sentenceLengths(filtered_data_TR['tar']), bins=10)\n", + "plt.xlabel('length of standard')\n", + "plt.ylabel('number of standard')\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 449 + }, + "id": "wET-0eUhp2Vv", + "outputId": "af7c7c24-db11-43ad-a5e7-2a8f2e376979" + }, + "execution_count": 50, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAGwCAYAAADhf7JcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABF1UlEQVR4nO3de1yVZb7///cCBTwBKsqhUDHNw6h4DLE8jYx4mIpyZnuapLLMtmfU1MlInXY4umvUraPbasS9RydrxrBRwxBP35JQUfIwwqiDUqML8gBLUFHh/v3Rz3u7BlRWLbwVX8/HYz3ivq/Pfa/PulR4d697XdgMwzAEAACAu8rD6gYAAAAeRIQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxQw+oGHiRlZWU6ffq06tWrJ5vNZnU7AACgEgzD0MWLFxUSEiIPD/ddvyKE3UWnT59WaGio1W0AAIAf4JtvvtHDDz/stvMRwu6ievXqSfr+D9HX19fibgAAQGU4HA6FhoaaP8fdhRB2F914C9LX15cQBgDAfcbdtxJxYz4AAIAFCGEAAAAWIIQBAABYgBAGAABgAUIYAACABQhhAAAAFiCEAQAAWIAQBgAAYAFCGAAAgAUIYQAAABYghAEAAFiAEAYAAGABS0PYrl279OSTTyokJEQ2m01JSUlO4zabrcLHwoULzZpmzZqVG58/f77TeQ4ePKiePXvKx8dHoaGhWrBgQblePv74Y7Vu3Vo+Pj5q3769Nm/e7DRuGIbi4+MVHBysWrVqKSoqSseOHXPfZAAAgAeKpSGsuLhY4eHhWrZsWYXjZ86ccXr84Q9/kM1m05AhQ5zq5s2b51Q3YcIEc8zhcKh///5q2rSpMjIytHDhQs2ZM0crV640a3bv3q3hw4dr9OjROnDggGJiYhQTE6PDhw+bNQsWLNCSJUu0YsUKpaenq06dOoqOjtaVK1fcPCsAAOBBYDMMw7C6Cen7q16ffPKJYmJiblkTExOjixcvKjU11dzXrFkzTZ48WZMnT67wmOXLl+v111+X3W6Xl5eXJGnmzJlKSkpSVlaWJGno0KEqLi7Wxo0bzeO6d++ujh07asWKFTIMQyEhIZo6daqmTZsmSSosLFRgYKASExM1bNiwCp+7pKREJSUl5rbD4VBoaKgKCwvl6+tbqXkBAADWcjgc8vPzc/vP7xpuO1MVy8vL06ZNm7R69epyY/Pnz9dvfvMbNWnSRCNGjNCUKVNUo8b3Ly0tLU29evUyA5gkRUdH67e//a0uXLig+vXrKy0tTXFxcU7njI6ONt8ezcnJkd1uV1RUlDnu5+eniIgIpaWl3TKEJSQkaO7cuT/2pVdKs5mb7srzuNvJ+YOtbgEAAEvcNzfmr169WvXq1dOzzz7rtH/ixIn68MMPtX37dr3yyit6++239dprr5njdrtdgYGBTsfc2Lbb7betuXn85uMqqqnIrFmzVFhYaD6++eYbV14yAACoxu6bK2F/+MMfNHLkSPn4+Djtv/kKVocOHeTl5aVXXnlFCQkJ8vb2vtttOvH29ra8BwAAcG+6L66E/b//9/+UnZ2tl1566Y61ERERun79uk6ePClJCgoKUl5enlPNje2goKDb1tw8fvNxFdUAAAC44r4IYR988IG6dOmi8PDwO9ZmZmbKw8NDjRs3liRFRkZq165dunbtmlmTkpKiVq1aqX79+mbNzTf736iJjIyUJIWFhSkoKMipxuFwKD093awBAABwhaVvRxYVFen48ePmdk5OjjIzM9WgQQM1adJE0vdh5+OPP9Y777xT7vi0tDSlp6erb9++qlevntLS0jRlyhT96le/MgPWiBEjNHfuXI0ePVozZszQ4cOHtXjxYv3ud78zzzNp0iT17t1b77zzjgYPHqwPP/xQ+/btM5exsNlsmjx5st566y21bNlSYWFheuONNxQSEnLbT3MCAADciqUhbN++f
erbt6+5feP+rtjYWCUmJkqSPvzwQxmGoeHDh5c73tvbWx9++KHmzJmjkpIShYWFacqUKU73ifn5+enzzz/XuHHj1KVLFwUEBCg+Pl5jxowxa3r06KG1a9dq9uzZ+vWvf62WLVsqKSlJ7dq1M2tee+01FRcXa8yYMSooKNATTzyh5OTkcveoAQAAVMY9s07Yg6Cq1hmRWKICAICqUlU/v++Le8IAAACqG0IYAACABQhhAAAAFiCEAQAAWIAQBgAAYAFCGAAAgAUIYQAAABYghAEAAFiAEAYAAGABQhgAAIAFCGEAAAAWIIQBAABYgBAGAABgAUIYAACABQhhAAAAFiCEAQAAWIAQBgAAYAFCGAAAgAUIYQAAABYghAEAAFiAEAYAAGABQhgAAIAFCGEAAAAWIIQBAABYgBAGAABgAUIYAACABQhhAAAAFiCEAQAAWIAQBgAAYAFCGAAAgAUIYQAAABYghAEAAFiAEAYAAGABQhgAAIAFCGEAAAAWIIQBAABYgBAGAABgAUIYAACABQhhAAAAFrA0hO3atUtPPvmkQkJCZLPZlJSU5DT+/PPPy2azOT0GDBjgVHP+/HmNHDlSvr6+8vf31+jRo1VUVORUc/DgQfXs2VM+Pj4KDQ3VggULyvXy8ccfq3Xr1vLx8VH79u21efNmp3HDMBQfH6/g4GDVqlVLUVFROnbsmHsmAgAAPHAsDWHFxcUKDw/XsmXLblkzYMAAnTlzxnz86U9/chofOXKkjhw5opSUFG3cuFG7du3SmDFjzHGHw6H+/furadOmysjI0MKFCzVnzhytXLnSrNm9e7eGDx+u0aNH68CBA4qJiVFMTIwOHz5s1ixYsEBLlizRihUrlJ6erjp16ig6OlpXrlxx44wAAIAHhc0wDMPqJiTJZrPpk08+UUxMjLnv+eefV0FBQbkrZDccPXpUbdu21d69e9W1a1dJUnJysgYNGqRvv/1WISEhWr58uV5//XXZ7XZ5eXlJkmbOnKmkpCRlZWVJkoYOHari4mJt3LjRPHf37t3VsWNHrVixQoZhKCQkRFOnTtW0adMkSYWFhQoMDFRiYqKGDRtWqdfocDjk5+enwsJC+fr6ujpFt9Vs5ia3nu9uOTl/sNUtAABwW1X18/uevydsx44daty4sVq1aqVXX31V586dM8fS0tLk7+9vBjBJioqKkoeHh9LT082aXr16mQFMkqKjo5Wdna0LFy6YNVFRUU7PGx0drbS0NElSTk6O7Ha7U42fn58iIiLMmoqUlJTI4XA4PQAAAKR7PIQNGDBA//M//6PU1FT99re/1c6dOzVw4ECVlpZKkux2uxo3bux0TI0aNdSgQQPZ7XazJjAw0Knmxvadam4ev/m4imoqkpCQID8/P/MRGhrq0usHAADVVw2rG7idm9/ma9++vTp06KBHHnlEO3bsUL9+/SzsrHJmzZqluLg4c9vhcBDEAACApHv8Sti/at68uQICAnT8+HFJUlBQkPLz851qrl+/rvPnzysoKMisycvLc6q5sX2nmpvHbz6uopqKeHt7y9fX1+kBAAAg3Wch7Ntvv9W5c+cUHBwsSYqMjFRBQYEyMjLMmm3btqmsrEwRERFmza5du3Tt2jWzJiUlRa1atVL9+vXNmtTUVKfnSklJUWRkpCQpLCxMQUFBTjUOh0Pp6elmDQAAgCssDWFFRUXKzMxUZmampO9vgM/MzFRubq6Kioo0ffp0ffXVVzp58qRSU1P19NNPq0WLFoqOjpYktWnTRgMGDNDLL7+sPXv26Msvv9T48eM1bNgwhYSESJJGjBghLy8vjR49WkeOHNG6deu0ePFip7cJJ02apOTkZL3zzjvKysrSnDlztG/fPo0fP17S95/cnDx5st566y19+umnOnTokEaNGqWQkBCnT3MCAABUlqX3hO3bt099+/Y1t28Eo9jYWC1fvlwHDx7U6tWrVVBQoJCQEPXv31+/+c1v5O3tbR6zZs0ajR8/Xv369ZOHh
4eGDBmiJUuWmON+fn76/PPPNW7cOHXp0kUBAQGKj493WkusR48eWrt2rWbPnq1f//rXatmypZKSktSuXTuz5rXXXlNxcbHGjBmjgoICPfHEE0pOTpaPj09VThEAAKim7pl1wh4ErBNWHuuEAQDudQ/sOmEAAADVESEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxACAMAALAAIQwAAMAChDAAAAALEMIAAAAsQAgDAACwACEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxACAMAALAAIQwAAMAChDAAAAALEMIAAAAsQAgDAACwACEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxACAMAALAAIQwAAMAChDAAAAALEMIAAAAsQAgDAACwACEMAADAAoQwAAAAC1gawnbt2qUnn3xSISEhstlsSkpKMseuXbumGTNmqH379qpTp45CQkI0atQonT592ukczZo1k81mc3rMnz/fqebgwYPq2bOnfHx8FBoaqgULFpTr5eOPP1br1q3l4+Oj9u3ba/PmzU7jhmEoPj5ewcHBqlWrlqKionTs2DH3TQYAAHigWBrCiouLFR4ermXLlpUbu3Tpkvbv36833nhD+/fv1/r165Wdna2nnnqqXO28efN05swZ8zFhwgRzzOFwqH///mratKkyMjK0cOFCzZkzRytXrjRrdu/ereHDh2v06NE6cOCAYmJiFBMTo8OHD5s1CxYs0JIlS7RixQqlp6erTp06io6O1pUrV9w8KwAA4EFgMwzDsLoJSbLZbPrkk08UExNzy5q9e/fqscce06lTp9SkSRNJ318Jmzx5siZPnlzhMcuXL9frr78uu90uLy8vSdLMmTOVlJSkrKwsSdLQoUNVXFysjRs3msd1795dHTt21IoVK2QYhkJCQjR16lRNmzZNklRYWKjAwEAlJiZq2LBhlXqNDodDfn5+KiwslK+vb6WOqaxmMze59Xx3y8n5g61uAQCA26qqn9/31T1hhYWFstls8vf3d9o/f/58NWzYUJ06ddLChQt1/fp1cywtLU29evUyA5gkRUdHKzs7WxcuXDBroqKinM4ZHR2ttLQ0SVJOTo7sdrtTjZ+fnyIiIsyaipSUlMjhcDg9AAAAJKlGZYrq168vm81WqROeP3/+RzV0K1euXNGMGTM0fPhwpxQ6ceJEde7cWQ0aNNDu3bs1a9YsnTlzRu+++64kyW63KywszOlcgYGB5lj9+vVlt9vNfTfX2O12s+7m4yqqqUhCQoLmzp37A18xAACozioVwhYtWmR+fe7cOb311luKjo5WZGSkpO+vJG3ZskVvvPFGlTR57do1/du//ZsMw9Dy5cudxuLi4syvO3ToIC8vL73yyitKSEiQt7d3lfRTWbNmzXLqz+FwKDQ01MKOAADAvaJSISw2Ntb8esiQIZo3b57Gjx9v7ps4caKWLl2qrVu3asqUKW5t8EYAO3XqlLZt23bH92IjIiJ0/fp1nTx5Uq1atVJQUJDy8vKcam5sBwUFmf+tqObm8Rv7goODnWo6dux4y168vb0tD4IAAODe5PI9YVu2bNGAAQPK7R8wYIC2bt3qlqZuuBHAjh07pq1bt6phw4Z3PCYzM1MeHh5q3LixJCkyMlK7du3StWvXzJqUlBS1atVK9evXN2tSU1OdzpOSkmJe6QsLC1NQUJBTjcPhUHp6ulkDAADgCpdDWMOGDbVhw4Zy+zds2FCpkHSzoqIiZWZmKjMzU9L3N8BnZmYqNzdX165d0y9+8Qvt27dPa9asUWlpqex2u+x2u65evSrp+7dBFy1apK+//lr/+Mc/tGbNGk2ZMkW/+tWvzIA1YsQIeXl5afTo0Tpy5IjWrVunxYsXO71NOGnSJCUnJ+udd95RVlaW5syZo3379plX+2w2myZPnqy33npLn376qQ4dO
qRRo0YpJCTktp/mBAAAuJVKvR15s7lz5+qll17Sjh07FBERIUlKT09XcnKy3nvvPZfOtW/fPvXt29fcvhGMYmNjNWfOHH366aeSVO4tv+3bt6tPnz7y9vbWhx9+qDlz5qikpERhYWGaMmWKU8Dy8/PT559/rnHjxqlLly4KCAhQfHy8xowZY9b06NFDa9eu1ezZs/XrX/9aLVu2VFJSktq1a2fWvPbaayouLtaYMWNUUFCgJ554QsnJyfLx8XHpNQMAAEg/cJ2w9PR0LVmyREePHpUktWnTRhMnTjRDGSrGOmHlsU4YAOBeV1U/v126Enbt2jW98soreuONN7RmzRq3NQEAAPCgcemesJo1a+ovf/lLVfUCAADwwHD5xvyYmBinX7QNAAAA17l8Y37Lli01b948ffnll+rSpYvq1KnjND5x4kS3NQcAAFBduRzCPvjgA/n7+ysjI0MZGRlOYzabjRAGAABQCS6HsJycnKroAwAA4IHi8j1hAAAA+PFcvhImSd9++60+/fRT5ebmmqvX3/Duu++6pTEAAIDqzOUQlpqaqqeeekrNmzdXVlaW2rVrp5MnT8owDHXu3LkqegQAAKh2XH47ctasWZo2bZoOHTokHx8f/eUvf9E333yj3r1765e//GVV9AgAAFDtuBzCjh49qlGjRkmSatSoocuXL6tu3bqaN2+efvvb37q9QQAAgOrI5RBWp04d8z6w4OBgnThxwhw7e/as+zoDAACoxly+J6x79+764osv1KZNGw0aNEhTp07VoUOHtH79enXv3r0qegQAAKh2XA5h7777roqKiiRJc+fOVVFRkdatW6eWLVvyyUgAAIBKcjmENW/e3Py6Tp06WrFihVsbAgAAeBCwWCsAAIAFKnUlrH79+rLZbJU64fnz539UQwAAAA+CSoWwRYsWmV+fO3dOb731lqKjoxUZGSlJSktL05YtW/TGG29USZMAAADVjc0wDMOVA4YMGaK+fftq/PjxTvuXLl2qrVu3KikpyZ39VSsOh0N+fn4qLCyUr6+vW8/dbOYmt57vbjk5f7DVLQAAcFtV9fPb5XvCtmzZogEDBpTbP2DAAG3dutUtTQEAAFR3Loewhg0basOGDeX2b9iwQQ0bNnRLUwAAANWdy0tUzJ07Vy+99JJ27NihiIgISVJ6erqSk5P13nvvub1BAACA6sjlEPb888+rTZs2WrJkidavXy9JatOmjb744gszlAEAAOD2XA5hkhQREaE1a9a4uxcAAIAHxg8KYWVlZTp+/Ljy8/NVVlbmNNarVy+3NAYAAFCduRzCvvrqK40YMUKnTp3Sv65uYbPZVFpa6rbmAAAAqiuXQ9jYsWPVtWtXbdq0ScHBwZVeSR8AAAD/x+UQduzYMf35z39WixYtqqIfAACAB4LL64RFRETo+PHjVdELAADAA8PlK2ETJkzQ1KlTZbfb1b59e9WsWdNpvEOHDm5rDgAAoLpyOYQNGTJEkvTiiy+a+2w2mwzD4MZ8AACASnI5hOXk5FRFHwAAAA8Ul0NY06ZNq6IPAACAB8oPWqxVkv72t78pNzdXV69eddr/1FNP/eimAAAAqjuXQ9g//vEPPfPMMzp06JB5L5gkc70w7gkDAAC4M5eXqJg0aZLCwsKUn5+v2rVr68iRI9q1a5e6du2qHTt2VEGLAAAA1Y/LV8LS0tK0bds2BQQEyMPDQx4eHnriiSeUkJCgiRMn6sCBA1XRJwAAQLXi8pWw0tJS1atXT5IUEBCg06dPS/r+hv3s7Gz3dgcAAFBNuXwlrF27dvr6668VFhamiIgILViwQF5eXlq5cqWaN29eFT0CAABUOy5fCZs9e7bKysokSfPmzVNOTo569uypzZs3a/HixS6da9euXXryyScVEhIim82mpKQkp3HDMBQfH6/g4GDVqlVLUVFROnbsmFPN+fPnNXLkSPn6+srf31+jR49WUVGRU83BgwfVs2dP+fj4KDQ0VAsWLCjXy8cff6zWr
VvLx8dH7du31+bNm13uBQAAoLJcDmHR0dF69tlnJUktWrRQVlaWzp49q/z8fPXr18+lcxUXFys8PFzLli2rcHzBggVasmSJVqxYofT0dNWpU0fR0dG6cuWKWTNy5EgdOXJEKSkp2rhxo3bt2qUxY8aY4w6HQ/3791fTpk2VkZGhhQsXas6cOVq5cqVZs3v3bg0fPlyjR4/WgQMHFBMTo5iYGB0+fNilXgAAACrLZtxYY6KSXnzxRS1evNi8L+yG4uJiTZgwQX/4wx9+WCM2mz755BPFxMRI+v7KU0hIiKZOnapp06ZJkgoLCxUYGKjExEQNGzZMR48eVdu2bbV371517dpVkpScnKxBgwbp22+/VUhIiJYvX67XX39ddrtdXl5ekqSZM2cqKSlJWVlZkqShQ4equLhYGzduNPvp3r27OnbsqBUrVlSql8pwOBzy8/NTYWGhfH19f9A83UqzmZvcer675eT8wVa3AADAbVXVz2+Xr4StXr1aly9fLrf/8uXL+p//+R+3NCV9/+uR7Ha7oqKizH1+fn6KiIhQWlqapO8/qenv728GMEmKioqSh4eH0tPTzZpevXqZAUz6/mpedna2Lly4YNbc/Dw3am48T2V6qUhJSYkcDofTAwAAQHIhhDkcDhUWFsowDF28eNEpWFy4cEGbN29W48aN3daY3W6XJAUGBjrtDwwMNMfsdnu556xRo4YaNGjgVFPROW5+jlvV3Dx+p14qkpCQID8/P/MRGhp6h1cNAAAeFJX+dKS/v79sNptsNpseffTRcuM2m01z5851a3P3u1mzZikuLs7cdjgcBDEAACDJhRC2fft2GYahn/70p/rLX/6iBg0amGNeXl5q2rSpQkJC3NZYUFCQJCkvL0/BwcHm/ry8PHXs2NGsyc/Pdzru+vXrOn/+vHl8UFCQ8vLynGpubN+p5ubxO/VSEW9vb3l7e1fq9QIAgAdLpd+O7N27t/r06aOcnBzFxMSod+/e5iMyMtKtAUySwsLCFBQUpNTUVHOfw+FQenq6IiMjJUmRkZEqKChQRkaGWbNt2zaVlZUpIiLCrNm1a5euXbtm1qSkpKhVq1aqX7++WXPz89youfE8lekFAADAFS7fmH/06FF9+eWX5vayZcvUsWNHjRgxwrzRvbKKioqUmZmpzMxMSd/fAJ+Zmanc3FzZbDZNnjxZb731lj799FMdOnRIo0aNUkhIiPkJyjZt2mjAgAF6+eWXtWfPHn355ZcaP368hg0bZobCESNGyMvLS6NHj9aRI0e0bt06LV682OltwkmTJik5OVnvvPOOsrKyNGfOHO3bt0/jx4+XpEr1AgAA4AqXQ9j06dPNT/kdOnRIcXFxGjRokHJycpyCTWXs27dPnTp1UqdOnSRJcXFx6tSpk+Lj4yVJr732miZMmKAxY8aoW7duKioqUnJysnx8fMxzrFmzRq1bt1a/fv00aNAgPfHEE05rgPn5+enzzz9XTk6OunTpoqlTpyo+Pt5pLbEePXpo7dq1WrlypcLDw/XnP/9ZSUlJateunVlTmV4AAAAqy+V1wurWravDhw+rWbNmmjNnjg4fPqw///nP2r9/vwYNGnTbTws+6FgnrDzWCQMA3OvumXXCvLy8dOnSJUnS1q1b1b9/f0lSgwYNWAcLAACgklz+Bd5PPPGE4uLi9Pjjj2vPnj1at26dJOnvf/+7Hn74Ybc3CAAAUB25fCVs6dKlqlGjhv785z9r+fLleuihhyRJn332mQYMGOD2BgEAAKojl6+ENWnSxOl3LN7wu9/9zi0NAQAAPAhcvhIGAACAH48QBgAAYAFCGAAAgAUqFcIOHjyosrKyqu4FAADggVGpENapUyedPXtWktS8eXOdO3euSpsCAACo7ioVwvz9/ZWTkyNJOnnyJFfFAAAAfqRKLVExZMgQ9e7dW8HBwbLZbOratas8PT0rrP3HP/7h1gYBAACqo0qFsJUrV+rZZ5/V8ePHNXHiRL388suqV69eV
fcGAABQbVV6sdYbq+FnZGRo0qRJhDAAAIAfweUV81etWmV+/e2330oSvzMSAADARS6vE1ZWVqZ58+bJz89PTZs2VdOmTeXv76/f/OY33LAPAABQSS5fCXv99df1wQcfaP78+Xr88cclSV988YXmzJmjK1eu6D/+4z/c3iQAAEB143IIW716td5//3099dRT5r4OHTrooYce0r//+78TwgAAACrB5bcjz58/r9atW5fb37p1a50/f94tTQEAAFR3Loew8PBwLV26tNz+pUuXKjw83C1NAQAAVHcuvx25YMECDR48WFu3blVkZKQkKS0tTd988402b97s9gYBAACqI5evhPXu3Vt///vf9cwzz6igoEAFBQV69tlnlZ2drZ49e1ZFjwAAANWOy1fCJCkkJIQb8AEAAH4El6+EAQAA4McjhAEAAFiAEAYAAGABl0KYYRjKzc3VlStXqqofAACAB4LLIaxFixb65ptvqqofAACAB4JLIczDw0MtW7bUuXPnqqofAACAB4LL94TNnz9f06dP1+HDh6uiHwAAgAeCy+uEjRo1SpcuXVJ4eLi8vLxUq1Ytp3F+fyQAAMCduRzCFi1aVAVtAAAAPFhcDmGxsbFV0QcAAMAD5QetE3bixAnNnj1bw4cPV35+viTps88+05EjR9zaHAAAQHXlcgjbuXOn2rdvr/T0dK1fv15FRUWSpK+//lpvvvmm2xsEAACojlwOYTNnztRbb72llJQUeXl5mft/+tOf6quvvnJrcwAAANWVyyHs0KFDeuaZZ8rtb9y4sc6ePeuWpgAAAKo7l0OYv7+/zpw5U27/gQMH9NBDD7mlKQAAgOrO5RA2bNgwzZgxQ3a7XTabTWVlZfryyy81bdo0jRo1qip6BAAAqHZcDmFvv/22WrdurdDQUBUVFalt27bq1auXevToodmzZ1dFjwAAANWOy+uEeXl56b333tMbb7yhw4cPq6ioSJ06dVLLli2roj8AAIBq6QetEyZJTZo00cCBA/XLX/6ySgNYs2bNZLPZyj3GjRsnSerTp0+5sbFjxzqdIzc3V4MHD1bt2rXVuHFjTZ8+XdevX3eq2bFjhzp37ixvb2+1aNFCiYmJ5XpZtmyZmjVrJh8fH0VERGjPnj1V9roBAED19oNC2AcffKB27drJx8dHPj4+ateund5//3139yZJ2rt3r86cOWM+UlJSJEm//OUvzZqXX37ZqWbBggXmWGlpqQYPHqyrV69q9+7dWr16tRITExUfH2/W5OTkaPDgwerbt68yMzM1efJkvfTSS9qyZYtZs27dOsXFxenNN9/U/v37FR4erujoaHOxWgAAAFfYDMMwXDkgPj5e7777riZMmKDIyEhJUlpampYuXaopU6Zo3rx5VdLoDZMnT9bGjRt17Ngx2Ww29enTRx07drzl77T87LPP9POf/1ynT59WYGCgJGnFihWaMWOGvvvuO3l5eWnGjBnatGmTDh8+bB43bNgwFRQUKDk5WZIUERGhbt26aenSpZKksrIyhYaGasKECZo5c2aFz11SUqKSkhJz2+FwKDQ0VIWFhfL19XXHdJiazdzk1vPdLSfnD7a6BQAAbsvhcMjPz8/tP79dvhK2fPlyvffee0pISNBTTz2lp556SgkJCVq5cqV+//vfu62xily9elV//OMf9eKLL8pms5n716xZo4CAALVr106zZs3SpUuXzLG0tDS1b9/eDGCSFB0dLYfDYf6apbS0NEVFRTk9V3R0tNLS0sznzcjIcKrx8PBQVFSUWVORhIQE+fn5mY/Q0NAfNwEAAKDacPnG/GvXrqlr167l9nfp0qXcfVbulpSUpIKCAj3//PPmvhEjRqhp06YKCQnRwYMHNWPGDGVnZ2v9+vWSJLvd7hTAJJnbdrv9tjUOh0OXL1/WhQsXVFpaWmFNVlbWLfudNWuW4uLizO0bV8IAAABcDmHPPfecli9frnfffddp/8qVKzVy5Ei3NVaRDz74QAMHDlRIS
Ii5b8yYMebX7du3V3BwsPr166cTJ07okUceqdJ+7sTb21ve3t6W9gAAAO5NlQphN1/Nsdlsev/99/X555+re/fukqT09HTl5uZW6WKtp06d0tatW80rXLcSEREhSTp+/LgeeeQRBQUFlfsUY15eniQpKCjI/O+NfTfX+Pr6qlatWvL09JSnp2eFNTfOAQAA4IpK3RN24MAB83Ho0CF16dJFjRo10okTJ3TixAkFBASoc+fO5j1WVWHVqlVq3LixBg++/Y3cmZmZkqTg4GBJUmRkpA4dOuT0KcaUlBT5+vqqbdu2Zk1qaqrTeVJSUswPHnh5ealLly5ONWVlZUpNTTVrAAAAXFGpK2Hbt2+v6j5uq6ysTKtWrVJsbKxq1Pi/lk+cOKG1a9dq0KBBatiwoQ4ePKgpU6aoV69e6tChgySpf//+atu2rZ577jktWLBAdrtds2fP1rhx48y3CseOHaulS5fqtdde04svvqht27bpo48+0qZN//eJw7i4OMXGxqpr16567LHHtGjRIhUXF+uFF164u5MBAACqBZfvCbPC1q1blZubqxdffNFpv5eXl7Zu3WoGotDQUA0ZMsTp1yd5enpq48aNevXVVxUZGak6deooNjbWaSmNsLAwbdq0SVOmTNHixYv18MMP6/3331d0dLRZM3ToUH333XeKj4+X3W5Xx44dlZycXO5mfQAAgMpweZ2wK1eu6L/+67+0fft25efnq6yszGl8//79bm2wOqmqdUYk1gkDAKCqVNXPb5evhI0ePVqff/65fvGLX+ixxx5zWq8LAAAAleNyCNu4caM2b96sxx9/vCr6AQAAeCC4vGL+Qw89pHr16lVFLwAAAA8Ml0PYO++8oxkzZujUqVNV0Q8AAMADweW3I7t27aorV66oefPmql27tmrWrOk0fv78ebc1BwAAUF25HMKGDx+uf/7zn3r77bcVGBjIjfkAAAA/gMshbPfu3UpLS1N4eHhV9AMAAPBAcPmesNatW+vy5ctV0QsAAMADw+UQNn/+fE2dOlU7duzQuXPn5HA4nB4AAAC4M5ffjhwwYIAkqV+/fk77DcOQzWZTaWmpezoDAACoxlwOYVb/Mm8AAIDqwOUQ1rt376roAwAA4IHicgjbtWvXbcd79er1g5sBAAB4ULgcwvr06VNu381rhXFPGAAAwJ25/OnICxcuOD3y8/OVnJysbt266fPPP6+KHgEAAKodl6+E+fn5ldv3s5/9TF5eXoqLi1NGRoZbGgMAAKjOXL4SdiuBgYHKzs521+kAAACqNZevhB08eNBp2zAMnTlzRvPnz1fHjh3d1RcAAEC15nII69ixo2w2mwzDcNrfvXt3/eEPf3BbYwAAANWZyyEsJyfHadvDw0ONGjWSj4+P25oCAACo7lwOYU2bNq2KPgAAAB4oLocwSUpNTVVqaqry8/NVVlbmNMZbkgAAAHfmcgibO3eu5s2bp65duyo4ONhpoVYAAABUjsshbMWKFUpMTNRzzz1XFf0AAAA8EFxeJ+zq1avq0aNHVfQCAADwwHA5hL300ktau3ZtVfQCAADwwHD57cgrV65o5cqV2rp1qzp06KCaNWs6jb/77rtuaw4AAKC6+kEr5t9YGf/w4cNOY9ykDwAAUDkuh7Dt27dXRR8AAAAPFLf9Am8AAABUHiEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxACAMAALAAIQwAAMAC93QImzNnjmw2m9OjdevW5viVK1c0btw4NWzYUHXr1tWQIUOUl5fndI7c3FwNHjxYtWvXVuPGjTV9+nRdv37dqWbHjh3q3LmzvL291aJFCyUmJpbrZdmyZWrWrJl8fHwUERGhPXv2VMlrBgAAD4Z7OoRJ0k9+8hOdOXPGfHzxxRfm2JQpU/TXv/5VH3/8sXbu3KnTp0/r2WefNcdLS0s1ePBgXb16Vbt379bq1auVmJio+Ph4syYnJ0eDBw9W3759l
ZmZqcmTJ+ull17Sli1bzJp169YpLi5Ob775pvbv36/w8HBFR0crPz//7kwCAACodmyGYRhWN3Erc+bMUVJSkjIzM8uNFRYWqlGjRlq7dq1+8YtfSJKysrLUpk0bpaWlqXv37vrss8/085//XKdPn1ZgYKAkacWKFZoxY4a+++47eXl5acaMGdq0aZMOHz5snnvYsGEqKChQcnKyJCkiIkLdunXT0qVLJUllZWUKDQ3VhAkTNHPmzFv2X1JSopKSEnPb4XAoNDRUhYWF8vX1/dHzc7NmMze59Xx3y8n5g61uAQCA23I4HPLz83P7z+97/krYsWPHFBISoubNm2vkyJHKzc2VJGVkZOjatWuKiooya1u3bq0mTZooLS1NkpSWlqb27dubAUySoqOj5XA4dOTIEbPm5nPcqLlxjqtXryojI8OpxsPDQ1FRUWbNrSQkJMjPz898hIaG/oiZAAAA1ck9HcIiIiKUmJio5ORkLV++XDk5OerZs6cuXrwou90uLy8v+fv7Ox0TGBgou90uSbLb7U4B7Mb4jbHb1TgcDl2+fFlnz55VaWlphTU3znErs2bNUmFhofn45ptvXJ4DAABQPdWwuoHbGThwoPl1hw4dFBERoaZNm+qjjz5SrVq1LOyscry9veXt7W11GwAA4B50T18J+1f+/v569NFHdfz4cQUFBenq1asqKChwqsnLy1NQUJAkKSgoqNynJW9s36nG19dXtWrVUkBAgDw9PSusuXEOAAAAV91XIayoqEgnTpxQcHCwunTpopo1ayo1NdUcz87OVm5uriIjIyVJkZGROnTokNOnGFNSUuTr66u2bduaNTef40bNjXN4eXmpS5cuTjVlZWVKTU01awAAAFx1T4ewadOmaefOnTp58qR2796tZ555Rp6enho+fLj8/Pw0evRoxcXFafv27crIyNALL7ygyMhIde/eXZLUv39/tW3bVs8995y+/vprbdmyRbNnz9a4cePMtwnHjh2rf/zjH3rttdeUlZWl3//+9/roo480ZcoUs4+4uDi99957Wr16tY4ePapXX31VxcXFeuGFFyyZFwAAcP+7p+8J+/bbbzV8+HCdO3dOjRo10hNPPKGvvvpKjRo1kiT97ne/k4eHh4YMGaKSkhJFR0fr97//vXm8p6enNm7cqFdffVWRkZGqU6eOYmNjNW/ePLMmLCxMmzZt0pQpU7R48WI9/PDDev/99xUdHW3WDB06VN99953i4+Nlt9vVsWNHJScnl7tZHwAAoLLu6XXCqpuqWmdEYp0wAACqygO7ThgAAEB1RAgDAACwACEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxACAMAALAAIQwAAMAChDAAAAALEMIAAAAsQAgDAACwACEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxACAMAALAAIQwAAMAChDAAAAALEMIAAAAsQAgDAACwACEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxQw+oG8GBrNnOT1S247OT8wVa3AACoBrgSBgAAYAFCGAAAgAUIYQAAABYghAEAAFiAEAYAAGCBezqEJSQkqFu3bqpXr54aN26smJgYZWdnO9X06dNHNpvN6TF27FinmtzcXA0ePFi1a9dW48aNNX36dF2/ft2pZseOHercubO8vb3VokULJSYmlutn2bJlatasmXx8fBQREaE9e/a4/TUDAIAHwz0dwnbu3Klx48bpq6++UkpKiq5du6b+/furuLjYqe7ll1/WmTNnzMeCBQvMsdLSUg0ePFhXr17V7t27tXr1aiUmJio+Pt6sycnJ0eDBg9W3b19lZmZq8uTJeumll7RlyxazZt26dYqLi9Obb76p/fv3Kzw8XNHR0crPz6/6iQAAANWOzTAMw+omKuu7775T48aNtXPnTvXq1UvS91fCOnbsq
EWLFlV4zGeffaaf//znOn36tAIDAyVJK1as0IwZM/Tdd9/Jy8tLM2bM0KZNm3T48GHzuGHDhqmgoEDJycmSpIiICHXr1k1Lly6VJJWVlSk0NFQTJkzQzJkzK9W/w+GQn5+fCgsL5evr+0OnoUL343pb9yvWCQOAB0tV/fy+p6+E/avCwkJJUoMGDZz2r1mzRgEBAWrXrp1mzZqlS5cumWNpaWlq3769GcAkKTo6Wg6HQ0eOHDFroqKinM4ZHR2ttLQ0SdLVq1eVkZHhVOPh4aGoqCizpiIlJSVyOBxODwAAAOk+WjG/rKxMkydP1uOPP6527dqZ+0eMGKGmTZsqJCREBw8e1IwZM5Sdna3169dLkux2u1MAk2Ru2+3229Y4HA5dvnxZFy5cUGlpaYU1WVlZt+w5ISFBc+fO/eEvGgAAVFv3TQgbN26cDh8+rC+++MJp/5gxY8yv27dvr+DgYPXr108nTpzQI488crfbdDJr1izFxcWZ2w6HQ6GhoRZ2BAAA7hX3RQgbP368Nm7cqF27dunhhx++bW1ERIQk6fjx43rkkUcUFBRU7lOMeXl5kqSgoCDzvzf23Vzj6+urWrVqydPTU56enhXW3DhHRby9veXt7V25FwkAAB4o9/Q9YYZhaPz48frkk0+0bds2hYWF3fGYzMxMSVJwcLAkKTIyUocOHXL6FGNKSop8fX3Vtm1bsyY1NdXpPCkpKYqMjJQkeXl5qUuXLk41ZWVlSk1NNWsAAABccU9fCRs3bpzWrl2rDRs2qF69euY9XH5+fqpVq5ZOnDihtWvXatCgQWrYsKEOHjyoKVOmqFevXurQoYMkqX///mrbtq2ee+45LViwQHa7XbNnz9a4cePMq1Rjx47V0qVL9dprr+nFF1/Utm3b9NFHH2nTpv/7xGFcXJxiY2PVtWtXPfbYY1q0aJGKi4v1wgsv3P2JAQAA9717OoQtX75c0vfLUNxs1apVev755+Xl5aWtW7eagSg0NFRDhgzR7NmzzVpPT09t3LhRr776qiIjI1WnTh3FxsZq3rx5Zk1YWJg2bdqkKVOmaPHixXr44Yf1/vvvKzo62qwZOnSovvvuO8XHx8tut6tjx45KTk4ud7M+AABAZdxX64Td71gnrHpgnTAAeLCwThgAAEA1QggDAACwACEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxACAMAALAAIQwAAMAChDAAAAALEMIAAAAsQAgDAACwACEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEMAAAAAsQwgAAACxACAMAALAAIQwAAMAChDAAAAALEMIAAAAsQAgDAACwACEMAADAAoQwAAAACxDCAAAALEAIAwAAsAAhDAAAwAI1rG4AuN80m7nJ6hZcdnL+YKtbAAD8C66EAQAAWIAQBgAAYAFCGAAAgAUIYQAAABYghAEAAFiAEAYAAGABQhgAAIAFCGEuWrZsmZo1ayYfHx9FRERoz549VrcEAADuQ4QwF6xbt05xcXF68803tX//foWHhys6Olr5+flWtwYAAO4zNsMwDKubuF9ERESoW7duWrp0qSSprKxMoaGhmjBhgmbOnHnH4x0Oh/z8/FRYWChfX1+39nY/ruIO3A6r/AO4V1TVz29+bVElXb16VRkZGZo1a5a5z8PDQ1FRUUpLS6vwmJKSEpWUlJjbhYWFkr7/w3S3spJLbj8nYKUmUz62ugWXHZ4bbXULAKrAjZ/b7r5uRQirpLNnz6q0tFSBgYFO+wMDA5WVlVXhMQkJCZo7d265/aGhoVXSIwBr+S2yugMAVenixYvy8/Nz2/kIYVVo1qxZiouLM7fLysp0/vx5NWzYUDabzS3P4XA4FBoaqm+++cbtb3Hi1ph36zD31mHurcG8W+fG3Ofm5spmsykkJMSt5yeEVVJAQIA8PT2Vl5fntD8vL09BQUEVHuPt7S1vb2+nff7+/lXSn6+vL
/84LcC8W4e5tw5zbw3m3Tp+fn5VMvd8OrKSvLy81KVLF6Wmppr7ysrKlJqaqsjISAs7AwAA9yOuhLkgLi5OsbGx6tq1qx577DEtWrRIxcXFeuGFF6xuDQAA3GcIYS4YOnSovvvuO8XHx8tut6tjx45KTk4ud7P+3eTt7a0333yz3NueqFrMu3WYe+sw99Zg3q1T1XPPOmEAAAAW4J4wAAAACxDCAAAALEAIAwAAsAAhDAAAwAKEsPvYsmXL1KxZM/n4+CgiIkJ79uyxuqVqZ86cObLZbE6P1q1bm+NXrlzRuHHj1LBhQ9WtW1dDhgwpt6Av7mzXrl168sknFRISIpvNpqSkJKdxwzAUHx+v4OBg1apVS1FRUTp27JhTzfnz5zVy5Ej5+vrK399fo0ePVlFR0V18FfenO839888/X+7fwIABA5xqmHvXJSQkqFu3bqpXr54aN26smJgYZWdnO9VU5vtLbm6uBg8erNq1a6tx48aaPn26rl+/fjdfyn2nMnPfp0+fcn/vx44d61TjjrknhN2n1q1bp7i4OL355pvav3+/wsPDFR0drfz8fKtbq3Z+8pOf6MyZM+bjiy++MMemTJmiv/71r/r444+1c+dOnT59Ws8++6yF3d6fiouLFR4ermXLllU4vmDBAi1ZskQrVqxQenq66tSpo+joaF25csWsGTlypI4cOaKUlBRt3LhRu3bt0pgxY+7WS7hv3WnuJWnAgAFO/wb+9Kc/OY0z967buXOnxo0bp6+++kopKSm6du2a+vfvr+LiYrPmTt9fSktLNXjwYF29elW7d+/W6tWrlZiYqPj4eCte0n2jMnMvSS+//LLT3/sFCxaYY26bewP3pccee8wYN26cuV1aWmqEhIQYCQkJFnZV/bz55ptGeHh4hWMFBQVGzZo1jY8//tjcd/ToUUOSkZaWdpc6rH4kGZ988om5XVZWZgQFBRkLFy409xUUFBje3t7Gn/70J8MwDONvf/ubIcnYu3evWfPZZ58ZNpvN+Oc//3nXer/f/evcG4ZhxMbGGk8//fQtj2Hu3SM/P9+QZOzcudMwjMp9f9m8ebPh4eFh2O12s2b58uWGr6+vUVJScndfwH3sX+feMAyjd+/exqRJk255jLvmnith96GrV68qIyNDUVFR5j4PDw9FRUUpLS3Nws6qp2PHjikkJETNmzfXyJEjlZubK0nKyMjQtWvXnP4cWrdurSZNmvDn4EY5OTmy2+1O8+zn56eIiAhzntPS0uTv76+uXbuaNVFRUfLw8FB6evpd77m62bFjhxo3bqxWrVrp1Vdf1blz58wx5t49CgsLJUkNGjSQVLnvL2lpaWrfvr3TguHR0dFyOBw6cuTIXez+/vavc3/DmjVrFBAQoHbt2mnWrFm6dOmSOeauuWfF/PvQ2bNnVVpaWm6l/sDAQGVlZVnUVfUUERGhxMREtWrVSmfOnNHcuXPVs2dPHT58WHa7XV5eXuV+KXtgYKDsdrs1DVdDN+ayor/vN8bsdrsaN27sNF6jRg01aNCAP4sfacCAAXr22WcVFhamEydO6Ne//rUGDhyotLQ0eXp6MvduUFZWpsmTJ+vxxx9Xu3btJKlS31/sdnuF/y5ujOHOKpp7SRoxYoSaNm2qkJAQHTx4UDNmzFB2drbWr18vyX1zTwgDbmPgwIHm1x06dFBERISaNm2qjz76SLVq1bKwM+DuGDZsmPl1+/bt1aFDBz3yyCPasWOH+vXrZ2Fn1ce4ceN0+PBhp/tNcXfcau5vvqexffv2Cg4OVr9+/XTixAk98sgjbnt+3o68DwUEBMjT07Pcp2Ty8vIUFBRkUVcPBn9/fz366KM6fvy4goKCdPXqVRUUFDjV8OfgXjfm8nZ/34OCgsp9KOX69es6f/48fxZu1rx5cwUEBOj48eOSmPsfa/z48dq4caO2b9+uhx9+2Nxfme8vQUFBFf67uDGG27vV3FckIiJCkpz+3rtj7glh9yEvLy916dJFqamp5
r6ysjKlpqYqMjLSws6qv6KiIp04cULBwcHq0qWLatas6fTnkJ2drdzcXP4c3CgsLExBQUFO8+xwOJSenm7Oc2RkpAoKCpSRkWHWbNu2TWVlZeY3T7jHt99+q3Pnzik4OFgSc/9DGYah8ePH65NPPtG2bdsUFhbmNF6Z7y+RkZE6dOiQUwhOSUmRr6+v2rZte3deyH3oTnNfkczMTEly+nvvlrn/AR8kwD3gww8/NLy9vY3ExETjb3/7mzFmzBjD39/f6ZMa+PGmTp1q7Nixw8jJyTG+/PJLIyoqyggICDDy8/MNwzCMsWPHGk2aNDG2bdtm7Nu3z4iMjDQiIyMt7vr+c/HiRePAgQPGgQMHDEnGu+++axw4cMA4deqUYRiGMX/+fMPf39/YsGGDcfDgQePpp582wsLCjMuXL5vnGDBggNGpUycjPT3d+OKLL4yWLVsaw4cPt+ol3TduN/cXL140pk2bZqSlpRk5OTnG1q1bjc6dOxstW7Y0rly5Yp6DuXfdq6++avj5+Rk7duwwzpw5Yz4uXbpk1tzp+8v169eNdu3aGf379zcyMzON5ORko1GjRsasWbOseEn3jTvN/fHjx4158+YZ+/btM3JycowNGzYYzZs3N3r16mWew11zTwi7j/3Xf/2X0aRJE8PLy8t47LHHjK+++srqlqqdoUOHGsHBwYaXl5fx0EMPGUOHDjWOHz9ujl++fNn493//d6N+/fpG7dq1jWeeecY4c+aMhR3fn7Zv325IKveIjY01DOP7ZSreeOMNIzAw0PD29jb69etnZGdnO53j3LlzxvDhw426desavr6+xgsvvGBcvHjRgldzf7nd3F+6dMno37+/0ahRI6NmzZpG06ZNjZdffrnc/+wx966raM4lGatWrTJrKvP95eTJk8bAgQONWrVqGQEBAcbUqVONa9eu3eVXc3+509zn5uYavXr1Mho0aGB4e3sbLVq0MKZPn24UFhY6nccdc2/7/xsCAADAXcQ9YQAAABYghAEAAFiAEAYAAGABQhgAAIAFCGEAAAAWIIQBAABYgBAGAABgAUIYAACABQhhANyqT58+mjx5stVtSJJ27Nghm81W7pcgu8OcOXMUGBgom82mpKQkt5/fnapyHqry3EB1RwgDUC3czfB39OhRzZ07V//93/+tM2fOaODAgS4df/LkSdlsNvOXAgN4MNWwugEAuN+cOHFCkvT000/LZrNZ3M3dcfXqVXl5eVndBlCtcCUMQJUqKSnRtGnT9NBDD6lOnTqKiIjQjh07zPHExET5+/try5YtatOmjerWrasBAwbozJkzZs3169c1ceJE+fv7q2HDhpoxY4ZiY2MVExMjSXr++ee1c+dOLV68WDabTTabTSdPnjSPz8jIUNeuXVW7dm316NFD2dnZt+350KFD+ulPf6patWqpYcOGGjNmjIqKiiR9/zbkk08+KUny8PC4ZQi7cOGCRo4cqUaNGqlWrVpq2bKlVq1aJUkKCwuTJHXq1Ek2m019+vSRJO3du1c/+9nPFBAQID8/P/Xu3Vv79+93Oq/NZtP777+vZ555RrVr11bLli316aefOtVs3rxZjz76qGrVqqW+ffs6zYUknTt3TsOHD9dDDz2k2rVrq3379vrTn/7kVNOnTx+NHz9ekydPVkBAgKKjoyt1bgAucM/vJAeA7/Xu3duYNGmSuf3SSy8ZPXr0MHbt2mUcP37cWLhwoeHt7W38/e9/NwzDMFatWmXUrFnTiIqKMvbu3WtkZGQYbdq0MUaMGGGe46233jIaNGhgrF+/3jh69KgxduxYw9fX13j66acNwzCMgoICIzIy0nj55ZeNM2fOGGfOnDGuX79ubN++3ZBkREREGDt27DCOHDli9OzZ0+jRo8ct+y8qKjKCg4ONZ5991jh06JCRmppqhIWFGbGxsYZhGMbFixeNVatWGZLM56rIuHHjjI4dOxp79+41cnJyjJSUFOPTTz81DMMw9uzZY0gytm7dapw5c
8Y4d+6cYRiGkZqaavzv//6vcfToUeNvf/ubMXr0aCMwMNBwOBzmeSUZDz/8sLF27Vrj2LFjxsSJE426deua58jNzTW8vb2NuLg4Iysry/jjH/9oBAYGGpKMCxcuGIZhGN9++62xcOFC48CBA8aJEyeMJUuWGJ6enkZ6errTn2PdunWN6dOnG1lZWUZWVlalzg2g8ghhANzq5hB26tQpw9PT0/jnP//pVNOvXz9j1qxZhmEYZqA5fvy4Ob5s2TIjMDDQ3A4MDDQWLlxobl+/ft1o0qSJGcL+9XlvuBHCtm7dau7btGmTIcm4fPlyhf2vXLnSqF+/vlFUVOR0jIeHh2G32w3DMIxPPvnEuNP/wz755JPGCy+8UOFYTk6OIck4cODAbc9RWlpq1KtXz/jrX/9q7pNkzJ4929wuKioyJBmfffaZYRiGMWvWLKNt27ZO55kxY8Ydg9LgwYONqVOnmtu9e/c2OnXq5FTzQ88NoGLcEwagyhw6dEilpaV69NFHnfaXlJSoYcOG5nbt2rX1yCOPmNvBwcHKz8+XJBUWFiovL0+PPfaYOe7p6akuXbqorKysUn106NDB6dySlJ+fryZNmpSrPXr0qMLDw1WnTh1z3+OPP66ysjJlZ2crMDCwUs/56quvasiQIdq/f7/69++vmJgY9ejR47bH5OXlafbs2dqxY4fy8/NVWlqqS5cuKTc395avp06dOvL19TXn6+jRo4qIiHCqj4yMdNouLS3V22+/rY8++kj//Oc/dfXqVZWUlKh27dpOdV26dHHarsy5AVQeIQxAlSkqKpKnp6cyMjLk6enpNFa3bl3z65o1azqN2Ww2GYbhtj5uPv+Ne7gqG+B+qIEDB+rUqVPavHmzUlJS1K9fP40bN07/+Z//ectjYmNjde7cOS1evFhNmzaVt7e3IiMjdfXqVae6iubLldezcOFCLV68WIsWLVL79u1Vp04dTZ48udzz3BxEAbgfN+YDqDKdOnVSaWmp8vPz1aJFC6dHUFBQpc7h5+enwMBA7d2719xXWlpa7oZ1Ly8vlZaW/uie27Rpo6+//lrFxcXmvi+//FIeHh5q1aqVS+dq1KiRYmNj9cc//lGLFi3SypUrzV4llev3yy+/1MSJEzVo0CD95Cc/kbe3t86ePety/3v27HHa99VXX5V7nqefflq/+tWvFB4erubNm+vvf/+7W84NoPIIYQCqzKOPPqqRI0dq1KhRWr9+vXJycrRnzx4lJCRo06ZNlT7PhAkTlJCQoA0bNig7O1uTJk3ShQsXnD6Z2KxZM6Wnp+vkyZM6e/bsD77SNXLkSPn4+Cg2NlaHDx/W9u3bNWHCBD333HOVfitSkuLj47VhwwYdP35cR44c0caNG9WmTRtJUuPGjVWrVi0lJycrLy9PhYWFkqSWLVvqf//3f3X06FGlp6dr5MiRqlWrlkv9jx07VseOHdP06dOVnZ2ttWvXKjEx0ammZcuWSklJ0e7du3X06FG98sorysvLc8u5AVQeIQxAlVq1apVGjRqlqVOnqlWrVoqJidHevXsrvB/rVmbMmKHhw4dr1KhRioyMVN26dRUdHS0fHx+zZtq0afL09FTbtm3VqFGjcvdRVVbt2rW1ZcsWnT9/Xt26ddMvfvEL9evXT0uXLnXpPF5eXpo1a5Y6dOigXr16ydPTUx9++KEkqUaNGlqyZIn++7//WyEhIXr66aclSR988IEuXLigzp0767nnntPEiRPVuHFjl563SZMm+stf/qKkpCSFh4drxYoVevvtt51qZs+erc6dOys6Olp9+vRRUFCQudzHjz03gMqzGe688QIA7oKysjK1adNG//Zv/6bf/OY3VrcDAD8IN+YDuOedOnVKn3/+uXr37q2SkhItXbpUOTk5GjFihNWtAcAPxtuRAO55Hh4eSkxMVLdu3fT444/r0KFD2rp1q3mPFQDcj3g7EgAAwAJcCQMAALAAIQwAAMAChDAAAAALEMIAAAAsQAgDAACwA
CEMAADAAoQwAAAACxDCAAAALPD/AQdC59Aon0JdAAAAAElFTkSuQmCC\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "def threshold_len_max(max_len, data):\n", + " data = list(data) # 제네레이터를 리스트로 변환\n", + " sentence_count = 0\n", + " for sentence in data:\n", + " if len(sentence) <= max_len:\n", + " sentence_count += 1\n", + " return sentence_count / len(data) * 100\n", + "\n", + "def threshold_len_min(min_len, data):\n", + " data = list(data) # 제네레이터를 리스트로 변환\n", + " sentence_count = 0\n", + " for sentence in data:\n", + " if len(sentence) >= min_len:\n", + " sentence_count += 1\n", + " return sentence_count / len(data) * 100" + ], + "metadata": { + "id": "SqMQxZO4p1TQ" + }, + "execution_count": 51, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "len(filtered_data_TR['src'])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cRQ_fdfSE0Rb", + "outputId": "7abe010f-44bf-4014-d36f-b9a812542219" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "211878" + ] + }, + "metadata": {}, + "execution_count": 52 + } + ] + }, + { + "cell_type": "code", + "source": [ + "max_len = 22\n", + "dialect_max = threshold_len_max(max_len, (sentence.split(' ') for sentence in filtered_data_TR['src']))\n", + "standard_max = threshold_len_max(max_len, (sentence.split(' ') for sentence in filtered_data_TR['tar']))\n", + "\n", + "print(f\"dialect 중 {max_len} 이하인 비율은 {dialect_max}\")\n", + "print(f\"standard 중 {max_len} 이하인 비율은 {standard_max}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ali5lQXaqSf0", + "outputId": "398406ce-6798-40b8-9942-ab1546088d0f" + }, + "execution_count": 53, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "dialect 중 22 이하인 비율은 80.23060440442141\n", + "standard 중 22 이하인 비율은 80.11355591425254\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + 
"## 문장의 길이가 긴 것이 많아 80프로 정도의 데이터만 남김\n", + "\n", + "d_filter_indices = [i for i, sentence in enumerate(sentence.split(' ') for sentence in filtered_data_TR['src']) if len(sentence) <= max_len ]\n", + "s_filter_indices = [i for i, sentence in enumerate(sentence.split(' ') for sentence in filtered_data_TR['tar']) if len(sentence) <= max_len ]" + ], + "metadata": { + "id": "iLXOEUz2u45D" + }, + "execution_count": 54, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "indices = list(set(d_filter_indices) & set(s_filter_indices))" + ], + "metadata": { + "id": "aV630gtgwMDM" + }, + "execution_count": 55, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "len(indices)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "U2I4XBAtPd_b", + "outputId": "69928af8-0155-4376-f9ad-f74c02b7bb57" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "169723" + ] + }, + "metadata": {}, + "execution_count": 56 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pickle\n", + "1\n", + "# pickle 파일로부터 데이터를 불러옴\n", + "with open('/content/drive/MyDrive/LSTM+attention/filtered_dialect.pkl', 'rb') as f:\n", + " filtered_dialect = pickle.load(f)\n", + "\n", + "with open('/content/drive/MyDrive/LSTM+attention/filtered_standard.pkl', 'rb') as f:\n", + " filtered_standard = pickle.load(f)\n", + "\n", + "# 불러온 데이터를 확인\n", + "print(filtered_dialect[:10])\n", + "print(filtered_standard[:10])" + ], + "metadata": { + "id": "B041nyJnISFJ", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a1a6f21a-e62b-451d-ba83-7bc6131169a4" + }, + "execution_count": 57, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴', '장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴', '예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴', '이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네', '콩이파리는 가시가 있어가 꺼끄럽고 뻣뻣하고 묵어 보면 맛이 없어예', 
'여기에는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴', '여개는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴', '음식 먹으만 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니데이', '논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 무짜로 만지만 위험합니더', '딱꾹지를 멈치지도 않고 점들 하는디 이럴 때는 우예 해야 합니껴']\n", + "['여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까', '장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까', '예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까', '이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네', '콩잎은 가시가 있어서 껄끄럽고 뻣뻣하고 먹어 보면 맛이 없어요', '여기에는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까', '여기는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까', '음식 먹으면 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니다', '논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 함부로 만지면 위험합니다', '딱꾹지를 멈추지도 않고 점들 하는데 이럴 때는 어떻게 해야 합니까']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(len(filtered_dialect))\n", + "print(len(filtered_standard))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_dUEMZ8HRPow", + "outputId": "5067f019-15c7-478e-97ad-074ad2d14085" + }, + "execution_count": 58, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "169723\n", + "169723\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "plt.hist(sentenceLengths(filtered_dialect), bins=10)\n", + "plt.xlabel('length of dialect')\n", + "plt.ylabel('number of dialect')\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 449 + }, + "id": "yf8viS-nR3bN", + "outputId": "6b532d01-82f7-49b4-9926-a530de3da23d" + }, + "execution_count": 59, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "plt.hist(sentenceLengths(filtered_standard), bins=10)\n", + "plt.xlabel('length of standard')\n", + "plt.ylabel('number of standard')\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 449 + }, + "id": "2g430fC7RliO", + "outputId": "c0891822-c902-49b8-aeb0-4821ad1e608b" + }, + "execution_count": 60, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "SOS_token = 0\n", + "EOS_token = 1\n", + "UNK_token = 2\n", + "PAD_token = 3\n", + "\n", + "class Lang:\n", + " def __init__(self, name):\n", + " self.name = name\n", + " self.word2index = {\"UNK\": 2}\n", + " self.word2count = {}\n", + " self.index2word = {0: \"SOS\", 1: \"EOS\", 2: \"UNK\", 3: \"PAD\"}\n", + " self.n_words = 4 # SOS, EOS, UNK, PAD\n", + "\n", + " def addSentence(self, sentence):\n", + " for word in sentence.split(\" \"):\n", + " self.addWord(word)\n", + "\n", + " def addWord(self, word):\n", + " if word not in self.word2index:\n", + " self.word2index[word] = self.n_words\n", + " self.word2count[word] = 1\n", + " self.index2word[self.n_words] = word\n", + " self.n_words += 1\n", + " else:\n", + " self.word2count[word] += 1\n", + "\n", + " def getWordIndex(self, word):\n", + " return self.word2index.get(word, self.word2index[\"UNK\"])" + ], + "metadata": { + "id": "oMl0xGNU49XX" + }, + "execution_count": 61, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Lang 객체 생성\n", + "dialect_lang = Lang(\"Dialect\")\n", + "standard_lang = Lang(\"Standard\")\n", + "\n", + "# 문장 추가\n", + "for sentence in filtered_dialect[:10]:\n", + " dialect_lang.addSentence(sentence)\n", + "for sentence in filtered_standard[:10]:\n", + " standard_lang.addSentence(sentence)\n", + "\n", + "for sentence in filtered_df_VL['src'][:10]:\n", + " dialect_lang.addSentence(sentence)\n", + "for sentence in filtered_df_VL['tar'][:10]:\n", + " standard_lang.addSentence(sentence)\n", + "\n", + "# 문장\n", + "pairs = list(zip(filtered_dialect[:10], filtered_standard[:10]))\n", + "VL_pairs = list(zip(filtered_df_VL['src'][:10], filtered_df_VL['tar'][:10]))\n", + "\n", + "# 문장을 인덱스로 변환\n", + "def indexesFromSentence(lang, sentence):\n", + " return [lang.getWordIndex(word) for word in sentence.split(' ')]\n", + "\n", + "def tensorFromSentence(lang, sentence, 
max_length):\n", + " indexes = indexesFromSentence(lang, sentence)\n", + " indexes.append(EOS_token)\n", + " if len(indexes) < max_length:\n", + " indexes += [PAD_token] * (max_length - len(indexes))\n", + " elif len(indexes) > max_length:\n", + " indexes = indexes[:max_length-1] + [EOS_token]\n", + " return torch.tensor(indexes, dtype=torch.long).view(-1, 1)\n", + "\n", + "def tensorsFromPair(pair):\n", + " input_tensor = tensorFromSentence(dialect_lang, pair[0], max_len)\n", + " target_tensor = tensorFromSentence(standard_lang, pair[1], max_len)\n", + " return (input_tensor, target_tensor)" + ], + "metadata": { + "id": "VBPYjCbZ8l6k" + }, + "execution_count": 62, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "\n", + "max_len = 22\n", + "\n", + "# 검증 데이터를 인덱스로 변환\n", + "validation_input_tensors = [tensorFromSentence(dialect_lang, pair[0], max_len) for pair in VL_pairs]\n", + "validation_target_tensors = [tensorFromSentence(standard_lang, pair[1], max_len) for pair in VL_pairs]\n", + "\n", + "# 모델 정의\n", + "class EncoderRNN(nn.Module):\n", + " def __init__(self, input_size, hidden_size):\n", + " super(EncoderRNN, self).__init__()\n", + " self.hidden_size = hidden_size\n", + " self.embedding = nn.Embedding(input_size, hidden_size)\n", + " self.lstm = nn.LSTM(hidden_size, hidden_size)\n", + "\n", + " def forward(self, input, hidden):\n", + " embedded = self.embedding(input).view(1, 1, -1)\n", + " output, hidden = self.lstm(embedded, hidden)\n", + " return output, hidden\n", + "\n", + " def initHidden(self):\n", + " return (torch.zeros(1, 1, self.hidden_size),\n", + " torch.zeros(1, 1, self.hidden_size))\n", + "\n", + "class AttnDecoderRNN(nn.Module):\n", + " def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=max_len):\n", + " super(AttnDecoderRNN, self).__init__()\n", + " self.hidden_size = hidden_size\n", + " self.output_size = output_size\n", + 
" self.dropout_p = dropout_p\n", + " self.max_length = max_length\n", + "\n", + " self.embedding = nn.Embedding(self.output_size, self.hidden_size)\n", + " self.attn = nn.Linear(self.hidden_size * 2, self.max_length)\n", + " self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)\n", + " self.dropout = nn.Dropout(self.dropout_p)\n", + " self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)\n", + " self.out = nn.Linear(self.hidden_size, self.output_size)\n", + "\n", + " def forward(self, input, hidden, encoder_outputs):\n", + " embedded = self.embedding(input).view(1, 1, -1)\n", + " embedded = self.dropout(embedded)\n", + "\n", + " attn_weights = nn.functional.softmax(\n", + " self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)\n", + " attn_applied = torch.bmm(attn_weights.unsqueeze(0),\n", + " encoder_outputs.unsqueeze(0))\n", + "\n", + " output = torch.cat((embedded[0], attn_applied[0]), 1)\n", + " output = self.attn_combine(output).unsqueeze(0)\n", + "\n", + " output = nn.functional.relu(output)\n", + " output, hidden = self.lstm(output, hidden)\n", + "\n", + " output = nn.functional.log_softmax(self.out(output[0]), dim=1)\n", + " return output, hidden, attn_weights\n", + "\n", + " def initHidden(self):\n", + " return (torch.zeros(1, 1, self.hidden_size),\n", + " torch.zeros(1, 1, self.hidden_size))" + ], + "metadata": { + "id": "EyqODVGn87BL" + }, + "execution_count": 63, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import random\n", + "import time\n", + "import math\n", + "\n", + "def asMinutes(s):\n", + " m = math.floor(s / 60)\n", + " s -= m * 60\n", + " return f'{m}m {s:.2f}s'\n", + "\n", + "def timeSince(since, percent):\n", + " now = time.time()\n", + " s = now - since\n", + " es = s / (percent)\n", + " rs = es - s\n", + " return f'{asMinutes(s)} (- {asMinutes(rs)})'\n", + "\n", + "# 모델 훈련 함수\n", + "def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
def evaluate(encoder, decoder, input_tensor, target_tensor, criterion, max_length=max_len):
    """Compute the normalized loss of one (input, target) pair without updating weights.

    The input is encoded token by token, then the decoder is run greedily
    (its own top-1 prediction is fed back as the next input) while the loss
    against the reference target is accumulated. Decoding stops early when the
    decoder predicts ``EOS_token``; the summed loss is still divided by the
    full target length, which keeps the value comparable to the training loss
    reported by ``trainIters``.

    Both models are switched to eval mode for the duration of the call so that
    dropout does not add noise to the validation loss, and their previous
    training/eval mode is restored afterwards.

    Args:
        encoder: Encoder module; must provide ``initHidden()`` and ``hidden_size``.
        decoder: Decoder called as ``decoder(input, hidden, encoder_outputs)`` and
            returning ``(output, hidden, attention)``.
        input_tensor: Source token tensor, iterated along dim 0. Must not be
            longer than ``max_length`` (a longer input overflows the buffer).
        target_tensor: Reference token tensor, iterated along dim 0.
        criterion: Loss applied to each decoder output and target token.
        max_length: Number of rows allocated for the encoder-output buffer.

    Returns:
        float: Accumulated loss divided by the target length; ``0.0`` when the
        target is empty.
    """
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # With an empty target the loop below never runs and `loss` would stay the
    # int 0, which has no `.item()`.
    if target_length == 0:
        return 0.0

    # NOTE(review): assumes both models are torch.nn.Module instances (they are
    # used with .parameters() / .state_dict() elsewhere in this notebook).
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()

    try:
        with torch.no_grad():
            encoder_hidden = encoder.initHidden()
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

            loss = 0

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
                encoder_outputs[ei] = encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]])
            decoder_hidden = encoder_hidden

            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze().detach()  # use the prediction as the next input

                loss += criterion(decoder_output, target_tensor[di])
                if decoder_input.item() == EOS_token:
                    break
    finally:
        # Hand the models back in the mode the caller had them in, so a
        # validation pass in the middle of training does not disable dropout.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return loss.item() / target_length


def _validationLoss(encoder, decoder, criterion):
    """Return the mean ``evaluate`` loss over the validation tensors (NaN if there are none)."""
    total = 0.0
    count = 0
    for val_input, val_target in zip(validation_input_tensors, validation_target_tensors):
        total += evaluate(encoder, decoder, val_input, val_target, criterion)
        count += 1
    return total / count if count else float('nan')


def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train the encoder/decoder on randomly sampled pairs and plot the loss curves.

    Every ``print_every`` iterations the validation loss is computed over the
    whole validation set and a progress line is printed; every ``plot_every``
    iterations the mean training loss of that window is recorded. Each curve
    is stored together with the iteration at which it was measured, so the
    validation curve contains only freshly computed values.

    Args:
        encoder: Encoder module to optimize.
        decoder: Decoder module to optimize.
        n_iters: Number of training iterations (one sampled pair each).
        print_every: Interval for validation and the progress printout.
        plot_every: Interval for recording a training-loss plot point.
        learning_rate: Learning rate for both SGD optimizers.
    """
    start = time.time()
    plot_losses = []
    plot_iters = []
    plot_val_losses = []
    plot_val_iters = []
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0  # reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        training_pair = training_pairs[step - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        # Compute the loss on the validation data.
        if step % print_every == 0:
            val_loss_avg = _validationLoss(encoder, decoder, criterion)
            plot_val_losses.append(val_loss_avg)
            plot_val_iters.append(step)

            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f'{timeSince(start, step / n_iters)} ({step} {step / n_iters * 100:.2f}%) '
                  f'Train Loss: {print_loss_avg:.4f}, Val Loss: {val_loss_avg:.4f}')

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_iters.append(step)
            plot_loss_total = 0

    showPlot(plot_losses, plot_val_losses, plot_iters, plot_val_iters)


def showPlot(train_losses, val_losses, train_iters=None, val_iters=None):
    """Plot training and validation loss curves on one figure.

    Args:
        train_losses: Sequence of training-loss values.
        val_losses: Sequence of validation-loss values.
        train_iters: Optional x-values (iteration numbers) for ``train_losses``;
            when omitted the values are plotted against their list index.
        val_iters: Optional x-values (iteration numbers) for ``val_losses``;
            when omitted the values are plotted against their list index.
    """
    plt.figure()
    if train_iters is None:
        plt.plot(train_losses, label='Training Loss')
    else:
        plt.plot(train_iters, train_losses, label='Training Loss')
    if val_iters is None:
        plt.plot(val_losses, label='Validation Loss')
    else:
        # Validation points are sparse, so mark them to keep a single point visible.
        plt.plot(val_iters, val_losses, marker='o', label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()


# %%
# Model initialization and training
hidden_size = 256
encoder = EncoderRNN(dialect_lang.n_words, hidden_size)
decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)

trainIters(encoder, decoder, 1000, print_every=100, plot_every=50)  # run with a small number of iterations
10.72s) (900 90.00%) Train Loss: 0.6964, Val Loss: 5.1462\n", + "1m 45.93s (- 0m 0.00s) (1000 100.00%) Train Loss: 0.7646, Val Loss: 5.2876\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "def saveModel(encoder, decoder, encoder_path='encoder.pth', decoder_path='decoder.pth'): ## 모델 저장\n", + " torch.save(encoder.state_dict(), encoder_path)\n", + " torch.save(decoder.state_dict(), decoder_path)" + ], + "metadata": { + "id": "_Gjpck4MUHBm" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def loadModel(encoder_path='encoder.pth', decoder_path='decoder.pth'): ## 모델 로드\n", + " encoder = EncoderRNN(dialect_lang.n_words, hidden_size)\n", + " decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)\n", + " encoder.load_state_dict(torch.load(encoder_path))\n", + " decoder.load_state_dict(torch.load(decoder_path))\n", + " return encoder, decoder" + ], + "metadata": { + "id": "t3WSCqgH-djn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "encoder_path = '/content/drive/MyDrive/LSTM+attention/test_encoder.pth'\n", + "decoder_path = '/content/drive/MyDrive/LSTM+attention/test_decoder.pth'" + ], + "metadata": { + "id": "TqE3502bKBs5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "saveModel(encoder, decoder, encoder_path, decoder_path)" + ], + "metadata": { + "id": "W6R4lLLE-gqu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 모델 평가 함수\n", + "def evaluateRandomly(encoder, decoder, n=10):\n", + " for i in range(n):\n", + " pair = random.choice(test_pairs)\n", + " print('Dialect:', pair[0])\n", + " print('Expected:', pair[1])\n", + " output_words = evaluate(encoder, decoder, pair[0])\n", + " output_sentence = ' '.join(output_words)\n", + " print('Predicted:', output_sentence)\n", + " print('')\n" + ], + "metadata": { + "id": "zYySN_5AUvbG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "## 테스트 데이터 준비 
필요\n", + "test_dialect_sentences = [\"밥 무나?\", \"와 이리 더운교?\", \"이거 맛있다카이\", \"오늘 날씨 좋네예\"]\n", + "test_standard_sentences = [\"밥 먹었니?\", \"왜 이렇게 덥지?\", \"이거 맛있다고 하네\", \"오늘 날씨 좋네\"]\n", + "\n", + "test_pairs = list(zip(test_dialect_sentences, test_standard_sentences))" + ], + "metadata": { + "id": "ch8xAa69U5DA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 저장된 모델 불러오기\n", + "encoder, decoder = loadModel(encoder_path, decoder_path)\n", + "\n", + "hidden_size = 256\n", + "encoder = EncoderRNN(dialect_lang.n_words, hidden_size)\n", + "decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)\n", + "\n", + "evaluateRandomly(encoder, decoder, n=len(test_pairs))" + ], + "metadata": { + "id": "THo_PKRYM4vP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "## 테스트 함수 실행\n", + "evaluateRandomly(encoder, decoder, n=len(test_pairs))" + ], + "metadata": { + "id": "JQNbhsGTVRCe" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file