# %% Mount Google Drive so the CSV files under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# %% Load the training (TL) and validation (VL) sentence tables.
import pandas as pd
import numpy as np
import os
import json
import csv

TL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataTL.csv'
VL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataVL.csv'


def _read_sentence_pairs(csv_path):
    """Read one sentence CSV and return it without the 'Pronunciation' column or duplicate rows.

    The 'Pronunciation' column is considered unnecessary for this task, so it is
    removed first; duplicates are then judged on the remaining columns only and
    the index is renumbered from 0.
    """
    return (
        pd.read_csv(csv_path, encoding='utf-8')
        .drop(columns='Pronunciation')
        .drop_duplicates()
        .reset_index(drop=True)
    )


TL_sentence_data = _read_sentence_pairs(TL_sentence_path)
VL_sentence_data = _read_sentence_pairs(VL_sentence_path)

# %% Preview the first five training rows.
TL_sentence_data.head(5)
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DialectStandard
0여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
1장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까
2예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까
3음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까
4이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"TL_sentence_data[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Dialect\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uac00 \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uae30 \\uc788\\uc2b5\\ub2c8\\uaef4\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub315\\uae30\\uc774\\uaebc\\ub124 \\uc778\\uc790 \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub760\\ub118\\uc73c\\uba74 \\uc548 \\ub374\\ub2e4 \\ucea4\\uc2b5\\ub2c8\\uaef4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Standard\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uc11c \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uac8c \\uc788\\uc2b5\\ub2c8\\uae4c\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub2e4\\ub2c8\\ub2c8\\uae4c \\uc774\\uc81c \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub6f0\\uc5b4\\ub118\\uc73c\\uba74 \\uc548 \\ub41c\\ub2e4 \\ud588\\uc2b5\\ub2c8\\uae4c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n 
# %% Preview the first five validation (VL) rows.
VL_sentence_data.head(5)
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DialectStandard
0오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요
1혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다
2집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼
3아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐
4옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
# %% Pull the standard / dialect columns out of the training and validation frames.
standard_sentences_TL = TL_sentence_data['Standard']
dialect_sentences_TL = TL_sentence_data['Dialect']
standard_sentences_VL = VL_sentence_data['Standard']
dialect_sentences_VL = VL_sentence_data['Dialect']

# %%
standard_sentences_TL[:5]

# %%
dialect_sentences_TL[:5]

# %% Training data: drop pairs whose dialect sentence is identical to the standard sentence.
# One element-wise comparison replaces the per-row Python loop; it is faster and
# does not depend on the Series carrying a 0..n-1 integer index.
differs_TR = dialect_sentences_TL.ne(standard_sentences_TL)

# Kept as a dict of plain lists (not a DataFrame): later cells index
# filtered_data_TR['src'][i] / ['tar'][i] by position.
filtered_data_TR = {
    "src": dialect_sentences_TL[differs_TR].tolist(),
    "tar": standard_sentences_TL[differs_TR].tolist(),
}

filtered_df_TR = pd.DataFrame(filtered_data_TR)

filtered_df_TR[:10]
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
srctar
0여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
1장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까
2예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까
3이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네
4콩이파리는 가시가 있어가 꺼끄럽고 뻣뻣하고 묵어 보면 맛이 없어예콩잎은 가시가 있어서 껄끄럽고 뻣뻣하고 먹어 보면 맛이 없어요
5여기에는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴여기에는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
6여개는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴여기는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
7음식 먹으만 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니데이음식 먹으면 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니다
8논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 무짜로 만지만 위험합니더논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 함부로 만지면 위험합니다
9딱꾹지를 멈치지도 않고 점들 하는디 이럴 때는 우예 해야 합니껴딱꾹지를 멈추지도 않고 점들 하는데 이럴 때는 어떻게 해야 합니까
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"filtered_df_TR[:10]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"src\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"\\ub17c\\ub450\\ub801\\uc5d0 \\uc804\\uc120\\uc774 \\ub298\\uc5b4\\uc838 \\uc788\\uac70\\ub098 \\uc815\\uc804\\uc774 \\ub410\\uc744 \\ub54c \\ub450\\uaebc\\ube44 \\uc9d1\\uc744 \\ubb34\\uc9dc\\ub85c \\ub9cc\\uc9c0\\ub9cc \\uc704\\ud5d8\\ud569\\ub2c8\\ub354\",\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uac00 \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uae30 \\uc788\\uc2b5\\ub2c8\\uaef4\",\n \"\\uc5ec\\uae30\\uc5d0\\ub294 \\uc625\\uc218\\uac31\\uc774\\uac00 \\uc798 \\ub41c\\ub2e4 \\uce74\\ub358\\ub514 \\uc625\\uc218\\uac31\\uc774 \\ub9d0\\uace0\\ub294 \\ubb34\\uc2e0 \\ub18d\\uc0ac\\ub97c \\ub9c8\\uc774 \\uc9d3\\uc2b5\\ub2c8\\uaef4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tar\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"\\ub17c\\ub450\\ub801\\uc5d0 \\uc804\\uc120\\uc774 \\ub298\\uc5b4\\uc838 \\uc788\\uac70\\ub098 \\uc815\\uc804\\uc774 \\ub410\\uc744 \\ub54c \\ub450\\uaebc\\ube44 \\uc9d1\\uc744 \\ud568\\ubd80\\ub85c \\ub9cc\\uc9c0\\uba74 \\uc704\\ud5d8\\ud569\\ub2c8\\ub2e4\",\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uc11c \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uac8c \\uc788\\uc2b5\\ub2c8\\uae4c\",\n \"\\uc5ec\\uae30\\uc5d0\\ub294 \\uc625\\uc218\\uc218\\uac00 \\uc798 \\ub41c\\ub2e4 \\ud558\\ub358\\ub370 \\uc625\\uc218\\uc218 \\ub9d0\\uace0\\ub294 \\ubb34\\uc2a8 \\ub18d\\uc0ac\\ub97c \\ub9ce\\uc774 \\uc9d3\\uc2b5\\ub2c8\\uae4c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n 
# %% Validation data: drop pairs whose dialect sentence is identical to the standard sentence.
# One element-wise comparison replaces the per-row Python loop; it is faster and
# does not depend on the Series carrying a 0..n-1 integer index.
differs_VL = dialect_sentences_VL.ne(standard_sentences_VL)

# Same layout as the training counterpart: "src" holds dialect sentences,
# "tar" holds the matching standard sentences, both as plain lists.
filtered_data_VL = {
    "src": dialect_sentences_VL[differs_VL].tolist(),
    "tar": standard_sentences_VL[differs_VL].tolist(),
}

filtered_df_VL = pd.DataFrame(filtered_data_VL)

filtered_df_VL[:10]
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
srctar
0오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요
1집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼
2아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐
3옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠
4게얼에 먹을 채소나 과일 같은 것은 어데 보관을 했습니꺼겨울에 먹을 채소나 과일 같은 것은 어디에 보관을 했습니까
5촌구숙이라 젊은 사람들은 함부레 없고 전부 노인들만 있으이꺼네 농사 짓기가 힘들어요촌구석이라 젊은 사람들은 아예 없고 전부 노인들만 있으니까 농사 짓기가 힘들어요
6촌구석이라 젊은 사람들은 한 번이 없고 전부 노인들만 있으니까네 농사 짓기가 힘들어요촌구석이라 젊은 사람들은 한 번이 없고 전부 노인들만 있으니까 농사 짓기가 힘들어요
7소도 사람맨치로 잘 먹어야 근육도 붙고 심도 생겨서 일을 잘 하지로소도 사람처럼 잘 먹어야 근육도 붙고 힘도 생겨서 일을 잘 하지요
8소도 사람 맨치로 잘 먹어야 근육도 붇고 힘도 생겨서 일을 잘 하지요소도 사람 처럼 잘 먹어야 근육도 붇고 힘도 생겨서 일을 잘 하지요
9옷가심을 짜를 때는 미리 선을 끟어 놓아야 쪽바리 잘 자를 수 있어예옷감을 자를 때는 미리 선을 그어 놓아야 똑바로 잘 자를 수 있어요
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"filtered_df_VL[:10]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"src\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"\\uc18c\\ub3c4 \\uc0ac\\ub78c \\ub9e8\\uce58\\ub85c \\uc798 \\uba39\\uc5b4\\uc57c \\uadfc\\uc721\\ub3c4 \\ubd87\\uace0 \\ud798\\ub3c4 \\uc0dd\\uaca8\\uc11c \\uc77c\\uc744 \\uc798 \\ud558\\uc9c0\\uc694\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\uc774\\uaebc\\ub124 \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\ubf08\\ub2e4\\uc9c0\\uac00 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\",\n \"\\ucd0c\\uad6c\\uc219\\uc774\\ub77c \\uc80a\\uc740 \\uc0ac\\ub78c\\ub4e4\\uc740 \\ud568\\ubd80\\ub808 \\uc5c6\\uace0 \\uc804\\ubd80 \\ub178\\uc778\\ub4e4\\ub9cc \\uc788\\uc73c\\uc774\\uaebc\\ub124 \\ub18d\\uc0ac \\uc9d3\\uae30\\uac00 \\ud798\\ub4e4\\uc5b4\\uc694\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tar\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"\\uc18c\\ub3c4 \\uc0ac\\ub78c \\ucc98\\ub7fc \\uc798 \\uba39\\uc5b4\\uc57c \\uadfc\\uc721\\ub3c4 \\ubd87\\uace0 \\ud798\\ub3c4 \\uc0dd\\uaca8\\uc11c \\uc77c\\uc744 \\uc798 \\ud558\\uc9c0\\uc694\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\ub2c8\\uae4c \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\uc11c\\ub78d\\uc774 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\",\n \"\\ucd0c\\uad6c\\uc11d\\uc774\\ub77c \\uc80a\\uc740 \\uc0ac\\ub78c\\ub4e4\\uc740 \\uc544\\uc608 \\uc5c6\\uace0 \\uc804\\ubd80 \\ub178\\uc778\\ub4e4\\ub9cc \\uc788\\uc73c\\ub2c8\\uae4c \\ub18d\\uc0ac \\uc9d3\\uae30\\uac00 \\ud798\\ub4e4\\uc5b4\\uc694\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n 
# %%
import matplotlib
import matplotlib.pyplot as plt


def sentenceLengths(sentences):
    """Return, for each sentence, the number of pieces produced by splitting on a single space."""
    return [len(sentence.split(' ')) for sentence in sentences]


# %% Length distribution of the dialect (source) sentences.
plt.hist(sentenceLengths(filtered_data_TR['src']), bins=10)
plt.xlabel('length of dialect')
plt.ylabel('number of dialect')
plt.show()

# %% Length distribution of the standard (target) sentences.
plt.hist(sentenceLengths(filtered_data_TR['tar']), bins=10)
plt.xlabel('length of standard')
plt.ylabel('number of standard')
plt.show()


# %%
def _percent_with_length(data, length_ok):
    """Return the percentage (0-100) of items in `data` whose len() satisfies `length_ok`.

    `data` may be any iterable, including a generator; it is materialised once so
    it can be both counted and measured. Empty input yields 0.0 instead of
    raising ZeroDivisionError.
    """
    items = list(data)  # generators have no len(), so convert to a list first
    if not items:
        return 0.0
    matching = sum(1 for item in items if length_ok(len(item)))
    return matching / len(items) * 100


def threshold_len_max(max_len, data):
    """Return the percentage of items in `data` whose length is at most `max_len`.

    Args:
        max_len: inclusive upper bound on len(item).
        data: iterable of sized items (the notebook passes token lists).

    Returns:
        A float percentage in [0, 100]; 0.0 when `data` is empty.
    """
    return _percent_with_length(data, lambda length: length <= max_len)


def threshold_len_min(min_len, data):
    """Return the percentage of items in `data` whose length is at least `min_len`.

    Args:
        min_len: inclusive lower bound on len(item).
        data: iterable of sized items (the notebook passes token lists).

    Returns:
        A float percentage in [0, 100]; 0.0 when `data` is empty.
    """
    return _percent_with_length(data, lambda length: length >= min_len)


# %% Number of training pairs before length filtering.
len(filtered_data_TR['src'])

# %% Share of sentences that fit within max_len space-separated tokens.
max_len = 22
dialect_max = threshold_len_max(max_len, (sentence.split(' ') for sentence in filtered_data_TR['src']))
standard_max = threshold_len_max(max_len, (sentence.split(' ') for sentence in filtered_data_TR['tar']))

print(f"dialect 중 {max_len} 이하인 비율은 {dialect_max}")
print(f"standard 중 {max_len} 이하인 비율은 {standard_max}")
# %% Many sentences are long, so keep only the roughly 80% of pairs that fit in max_len tokens.
# Positions whose dialect / standard sentence has at most max_len space-separated tokens.
d_filter_indices = [
    i for i, sentence in enumerate(filtered_data_TR['src'])
    if len(sentence.split(' ')) <= max_len
]
s_filter_indices = [
    i for i, sentence in enumerate(filtered_data_TR['tar'])
    if len(sentence.split(' ')) <= max_len
]

# %% Positions where BOTH sides of the pair fit.
# Sorted so the list is deterministic and follows the original corpus order
# (iteration order of a set is not guaranteed).
indices = sorted(set(d_filter_indices) & set(s_filter_indices))

# %%
len(indices)

# %%
import tqdm  # no longer needed for this step; kept in case later cells use it — TODO confirm

# `indices` is ascending, so selecting by position yields exactly the rows (and
# order) produced by scanning every row and testing `i in indices`. That scan
# did a linear list search per row, i.e. quadratic work overall; the recorded
# run needed about 4.5 minutes for 211,878 rows.
filtered_dialect = [filtered_data_TR['src'][i] for i in indices]
filtered_standard = [filtered_data_TR['tar'][i] for i in indices]

# %% Both lists must have one entry per kept index.
print(len(filtered_dialect))
print(len(filtered_standard))

# %% Length distribution of the dialect sentences after filtering.
plt.hist(sentenceLengths(filtered_dialect), bins=10)
plt.xlabel('length of dialect')
plt.ylabel('number of dialect')
plt.show()
"],"image/png":"iVBORw0KGgoAAAANSUhEUgAAAk0AAAGwCAYAAAC0HlECAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAzh0lEQVR4nO3de1iUdf7/8deAcvAwgydAE8WyUtbEBMXJX0dZscgydVfLb5KHunLRVLLUMi07aHbS0rLDrvTdKzu4W1aSmJLiZqSGWdomHVbFUsBDQJCCMvfvj5b5OmH6GRuaQZ+P65rrYj73e+55z0yTr+u+P/dnbJZlWQIAAMBJBfm7AQAAgIaA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCgkb8bOFO4XC7t3btXzZs3l81m83c7AADAgGVZ+vHHH9WuXTsFBZ38WBKhyUf27t2rmJgYf7cBAABOw549e9S+ffuT1hCafKR58+aSfn7T7Xa7n7sBAAAmysvLFRMT4/53/GQITT5Se0rObrcTmgAAaGBMptYwERwAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMBAI383AODsEzsty98teG3X3FR/twDAzzjSBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYCBgQtPcuXNls9k0adIk99iRI0eUnp6uVq1aqVmzZhoyZIiKi4s9HldYWKjU1FQ1adJEkZGRuuuuu3Ts2DGPmnXr1qlnz54KDQ1V586dlZmZWef5Fy1apNjYWIWFhSkpKUmbNm2qj5cJAAAaqIAITZs3b9bzzz+v7t27e4xPnjxZ7777rpYtW6bc3Fzt3btXgwcPdm+vqalRamqqqqur9dFHH+nll19WZmamZs6c6a7ZuXOnUlNTdeWVV2rr1q2aNGmSxo4dq1WrVrlrXn/9dWVkZGjWrFnasmWL4uPjlZKSopKSkvp/8QAAoEGwWZZl+bOBiooK9ezZU88++6weeugh9ejRQ/Pnz1dZWZnatGmjpUuXaujQoZKkHTt2qGvXrsrLy1OfPn20cuVKXXvttdq7d6+ioqIkSYsXL9bUqVO1f/9+hYSEaOrUqcrKytL27dvdzzl8+HCVlpYqOztbkpSUlKRevXpp4cKFkiSXy6WYmBhNmDBB06ZNM3od5eXlcjgcKisrk91u9+VbBJxxYqdl+bsFr+2am+rvFgDUA2/+/fb7kab09HSlpqYqOTnZYzw/P19Hjx71GO/SpYs6dOigvLw8SVJeXp4uuugid2CSpJSUFJWXl+uLL75w1/xy3ykpKe59VFdXKz8/36MmKChIycnJ7poTqaqqUnl5uccNAACcuRr588lfe+01bdmyRZs3b66zraioSCEhIYqIiPAYj4qKUlFRkbvm+MBUu71228lqysvLdfjwYf3www+qqak5Yc2OHTt+tfc5c+bogQceMHuhAACgwfPbkaY9e/Zo4sSJeuWVVxQWFuavNk7b9OnTVVZW5r7t2bPH3y0BAIB65LfQlJ+fr5KSEvXs2VONGjVSo0aNlJubq6efflqNGjVSVFSUqqurVVpa6vG44uJiRUdHS5Kio6PrXE1Xe/9UNXa7XeHh4Wrd
urWCg4NPWFO7jxMJDQ2V3W73uAEAgDOX30JTv379tG3bNm3dutV9S0xM1IgRI9x/N27cWDk5Oe7HFBQUqLCwUE6nU5LkdDq1bds2j6vcVq9eLbvdrri4OHfN8fuorandR0hIiBISEjxqXC6XcnJy3DUAAAB+m9PUvHlzdevWzWOsadOmatWqlXt8zJgxysjIUMuWLWW32zVhwgQ5nU716dNHktS/f3/FxcXp5ptv1rx581RUVKQZM2YoPT1doaGhkqTbb79dCxcu1N13363Ro0frgw8+0BtvvKGsrP+7eicjI0NpaWlKTExU7969NX/+fFVWVmrUqFG/07sBAAACnV8ngp/KU089paCgIA0ZMkRVVVVKSUnRs88+694eHBysFStWaNy4cXI6nWratKnS0tI0e/Zsd02nTp2UlZWlyZMna8GCBWrfvr1eeuklpaSkuGuGDRum/fv3a+bMmSoqKlKPHj2UnZ1dZ3I4AAA4e/l9naYzBes0AeZYpwlAoGhQ6zQBAAA0BIQmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4383QAANASx07L83YLXds1N9XcLwBmFI00AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGCE0AAAAGvA5N5557rg4ePFhnvLS0VOeee65PmgIAAAg0XoemXbt2qaamps54VVWVvv/+e580BQAAEGgamRa+88477r9XrVolh8Phvl9TU6OcnBzFxsb6tDkAAIBAYRyaBg0aJEmy2WxKS0vz2Na4cWPFxsbqiSee8GlzAAAAgcI4NLlcLklSp06dtHnzZrVu3bremgIAAAg0xqGp1s6dO+ujDwAAgIDm9UTwO+64Q08//XSd8YULF2rSpEm+6AkAACDgeB2a/vnPf6pv3751xi+55BL94x//8ElTAAAAgcbr0HTw4EGPK+dq2e12HThwwCdNAQAABBqvQ1Pnzp2VnZ1dZ3zlypUsbgkAAM5YXk8Ez8jI0Pjx47V//35dddVVkqScnBw98cQTmj9/vq/7AwAACAheh6bRo0erqqpKDz/8sB588EFJUmxsrJ577jmNHDnS5w0CAAAEgtP6wd5x48bpu+++U3FxscrLy/Wf//zntALTc889p+7du8tut8tut8vpdGrlypXu7UeOHFF6erpatWqlZs2aaciQISouLvbYR2FhoVJTU9WkSRNFRkbqrrvu0rFjxzxq1q1bp549eyo0NFSdO3dWZmZmnV4WLVqk2NhYhYWFKSkpSZs2bfL69QAAgDPXaYWmY8eOac2aNXrzzTdlWZYkae/evaqoqPBqP+3bt9fcuXOVn5+vTz75RFdddZWuv/56ffHFF5KkyZMn691339WyZcuUm5urvXv3avDgwe7H19TUKDU1VdXV1froo4/08ssvKzMzUzNnznTX7Ny5U6mpqbryyiu1detWTZo0SWPHjtWqVavcNa+//royMjI0a9YsbdmyRfHx8UpJSVFJScnpvD0AAOAMZLNqU4+h3bt3a8CAASosLFRVVZW++uornXvuuZo4caKqqqq0ePHi39RQy5Yt9dhjj2no0KFq06aNli5dqqFDh0qSduzYoa5duyovL099+vTRypUrde2112rv3r2KioqSJC1evFhTp07V/v37FRISoqlTpyorK0vbt293P8fw
4cNVWlrqntCelJSkXr16aeHChZJ+Xv08JiZGEyZM0LRp007YZ1VVlaqqqtz3y8vLFRMTo7KyMtnt9t/0HgBnuthpWf5u4aywa26qv1sAAl55ebkcDofRv99eH2maOHGiEhMT9cMPPyg8PNw9fsMNNygnJ8f7bv+rpqZGr732miorK+V0OpWfn6+jR48qOTnZXdOlSxd16NBBeXl5kqS8vDxddNFF7sAkSSkpKSovL3cfrcrLy/PYR21N7T6qq6uVn5/vURMUFKTk5GR3zYnMmTNHDofDfYuJiTnt1w4AAAKf16HpX//6l2bMmKGQkBCP8djYWH3//fdeN7Bt2zY1a9ZMoaGhuv322/XWW28pLi5ORUVFCgkJUUREhEd9VFSUioqKJElFRUUegal2e+22k9WUl5fr8OHDOnDggGpqak5YU7uPE5k+fbrKysrctz179nj92gEAQMPh9dVzLpdLNTU1dca/++47NW/e3OsGLrzwQm3dulVlZWX6xz/+obS0NOXm5nq9n99baGioQkND/d0GAAD4nXh9pKl///4e6zHZbDZVVFRo1qxZuuaaa7xuICQkRJ07d1ZCQoLmzJmj+Ph4LViwQNHR0aqurlZpaalHfXFxsaKjoyVJ0dHRda6mq71/qhq73a7w8HC1bt1awcHBJ6yp3QcAAIDXoemJJ57Qhg0bFBcXpyNHjuimm25yn5p79NFHf3NDLpdLVVVVSkhIUOPGjT3mSRUUFKiwsFBOp1OS5HQ6tW3bNo+r3FavXi273a64uDh3zS/nWq1evdq9j5CQECUkJHjUuFwu5eTkuGsAAAC8Pj3Xvn17ffbZZ3rttdf0+eefq6KiQmPGjNGIESM8JoabmD59uq6++mp16NBBP/74o5YuXap169Zp1apVcjgcGjNmjDIyMtSyZUvZ7XZNmDBBTqdTffr0kfTzUa+4uDjdfPPNmjdvnoqKijRjxgylp6e7T53dfvvtWrhwoe6++26NHj1aH3zwgd544w1lZf3f1TsZGRlKS0tTYmKievfurfnz56uyslKjRo3y9u0BAABnKK9DkyQ1atRI//M///Obn7ykpEQjR47Uvn375HA41L17d61atUp//OMfJUlPPfWUgoKCNGTIEFVVVSklJUXPPvus+/HBwcFasWKFxo0bJ6fTqaZNmyotLU2zZ89213Tq1ElZWVmaPHmyFixYoPbt2+ull15SSkqKu2bYsGHav3+/Zs6cqaKiIvXo0UPZ2dl1JocDAICzl9E6Te+8847xDq+77rrf1FBD5c06D8DZjnWafh+s0wScmjf/fhsdaRo0aJDRE9tsthNeWQcAANDQGYUml8tV330AAAAEtNP67TkAAICzzWlNBK+srFRubq4KCwtVXV3tse2OO+7wSWMAAACBxOvQ9Omnn+qaa67RTz/9pMrKSrVs2VIHDhxQkyZNFBkZSWgCAABnJK9Pz02ePFkDBw50/2Dvxx9/rN27dyshIUGPP/54ffQIAADgd16Hpq1bt+rOO+9UUFCQgoODVVVVpZiYGM2bN0/33HNPffQIAADgd16HpsaNGyso6OeHRUZGqrCwUJLkcDi0Z88e33YHAAAQILye03TxxRdr8+bNOv/883X55Zdr5syZOnDggP7+97+rW7du9dEjAACA33l9pOmRRx5R27ZtJUkPP/ywWrRooXHjxmn//v164YUXfN4gAABAIPD6SFNiYqL778jISGVnZ/u0IQAAgEDE4pYAAAAGjI409ezZUzk5OWrRooUuvvhi2Wy2X63dsmWLz5oDAAAIFEah6frrr1doaKgk8x/vBQAAOJMYhaZZs2ad8G8AAICzBXOaAAAADBgdaWrRosVJ5zEd79ChQ7+pIQDeiZ2W5e8WAOCsYBSa5s+f7/774MGDeuihh5SSkiKn0ylJysvL06pVq3TffffVS5MAAAD+ZrMsy/LmAUOGDNGVV16p8ePHe4wvXLhQa9as0fLly33ZX4NRXl4uh8OhsrIy2e12
f7eDswhHmvBrds1N9XcLQMDz5t9vr+c0rVq1SgMGDKgzPmDAAK1Zs8bb3QEAADQIXoemVq1a6e23364z/vbbb6tVq1Y+aQoAACDQeP0zKg888IDGjh2rdevWKSkpSZK0ceNGZWdn68UXX/R5gwAAAIHA69B0yy23qGvXrnr66af15ptvSpK6du2qDz/80B2iAAAAzjRehyZJSkpK0iuvvOLrXgAAAAIWi1sCAAAYIDQBAAAYIDQBAAAYMApNn3/+uVwuV333AgAAELCMQtPFF1+sAwcOSJLOPfdcHTx4sF6bAgAACDRGoSkiIkI7d+6UJO3atYujTgAA4KxjtOTAkCFDdPnll6tt27ay2WxKTExUcHDwCWv/85//+LRBAACAQGAUml544QUNHjxY33zzje644w7deuutat68eX33BgAAEDCMF7es/ZHe/Px8TZw4kdAEAADOKl6vCL5kyRL33999950kqX379r7rCAAAIAB5vU6Ty+XS7Nmz5XA41LFjR3Xs2FERERF68MEHmSAOAADOWF4fabr33nv117/+VXPnzlXfvn0lSR9++KHuv/9+HTlyRA8//LDPmwQAAPA3r0PTyy+/rJdeeknXXXede6x79+4655xz9Je//IXQBAAAzkhen547dOiQunTpUme8S5cuOnTokE+aAgAACDReh6b4+HgtXLiwzvjChQsVHx/vk6YAAAACjden5+bNm6fU1FStWbNGTqdTkpSXl6c9e/bovffe83mDAAAAgcDrI02XX365vvrqK91www0qLS1VaWmpBg8erIKCAl166aX10SMAAIDfeX2kSZLatWvHhG8AAHBW8fpIEwAAwNmI0AQAAGCA0AQAAGDAq9BkWZYKCwt15MiR+uoHAAAgIHkdmjp37qw9e/bUVz8AAAAByavQFBQUpPPPP18HDx6sr34AAAACktdzmubOnau77rpL27dvr49+AAAAApLX6zSNHDlSP/30k+Lj4xUSEqLw8HCP7fz+HAAAOBN5HZrmz59fD20AAAAENq9DU1paWn30AQAAENBOa52mb7/9VjNmzNCNN96okpISSdLKlSv1xRdf+LQ5AACAQOF1aMrNzdVFF12kjRs36s0331RFRYUk6bPPPtOsWbN83iAAAEAg8Do0TZs2TQ899JBWr16tkJAQ9/hVV12ljz/+2KfNAQAABAqvQ9O2bdt0ww031BmPjIzUgQMHfNIUAABAoPE6NEVERGjfvn11xj/99FOdc845PmkKAAAg0HgdmoYPH66pU6eqqKhINptNLpdLGzZs0JQpUzRy5Mj66BEAAMDvvA5NjzzyiLp06aKYmBhVVFQoLi5Ol112mS655BLNmDGjPnoEAADwO6/XaQoJCdGLL76o++67T9u3b1dFRYUuvvhinX/++fXRHwAAQEDwOjTV6tChg2JiYiRJNpvNZw0BAAAEotNa3PKvf/2runXrprCwMIWFhalbt2566aWXfN0bAABAwPD6SNPMmTP15JNPasKECXI6nZKkvLw8TZ48WYWFhZo9e7bPmwQAAPA3r0PTc889pxdffFE33nije+y6665T9+7dNWHCBEITAAA4I3l9eu7o0aNKTEysM56QkKBjx475pCkAAIBA43Vouvnmm/Xcc8/VGX/hhRc0YsQIr/Y1Z84c9erVS82bN1dkZKQGDRqkgoICj5ojR44oPT1drVq1UrNmzTRkyBAVFxd71BQWFio1NVVNmjRRZGSk7rrrrjoBbt26derZs6dCQ0PVuXNnZWZm1uln0aJFio2NVVhYmJKSkrRp0yavXg8AADhzGZ2ey8jIcP9ts9n00ksv6f3331efPn0kSRs3blRhYaHXi1vm5uYqPT1dvXr10rFjx3TPPfeof//++ve//62mTZtKkiZPnqysrCwtW7ZMDodD48eP1+DBg7VhwwZJUk1NjVJTUxUdHa2PPvpI+/bt08iRI9W4cWM98sgjkqSdO3cqNTVVt99+u1555RXl5ORo7Nix
atu2rVJSUiRJr7/+ujIyMrR48WIlJSVp/vz5SklJUUFBgSIjI716XQAA4MxjsyzLOlXRlVdeabYzm00ffPDBaTezf/9+RUZGKjc3V5dddpnKysrUpk0bLV26VEOHDpUk7dixQ127dlVeXp769OmjlStX6tprr9XevXsVFRUlSVq8eLGmTp2q/fv3KyQkRFOnTlVWVpa2b9/ufq7hw4ertLRU2dnZkqSkpCT16tVLCxculCS5XC7FxMRowoQJmjZt2il7Ly8vl8PhUFlZmex2+2m/B4C3Yqdl+bsFBKhdc1P93QIQ8Lz599voSNPatWt90tiplJWVSZJatmwpScrPz9fRo0eVnJzsrunSpYs6dOjgDk15eXm66KKL3IFJklJSUjRu3Dh98cUXuvjii5WXl+exj9qaSZMmSZKqq6uVn5+v6dOnu7cHBQUpOTlZeXl5J+y1qqpKVVVV7vvl5eW/7cUDAICAdlrrNNUHl8ulSZMmqW/fvurWrZskqaioSCEhIYqIiPCojYqKUlFRkbvm+MBUu71228lqysvLdfjwYR04cEA1NTUnrKndxy/NmTNHDofDfatd6BMAAJyZvF5y4MiRI3rmmWe0du1alZSUyOVyeWzfsmXLaTWSnp6u7du368MPPzytx//epk+f7jHXq7y8nOAEAMAZzOvQNGbMGL3//vsaOnSoevfu7ZOfUBk/frxWrFih9evXq3379u7x6OhoVVdXq7S01ONoU3FxsaKjo901v7zKrfbquuNrfnnFXXFxsex2u8LDwxUcHKzg4OAT1tTu45dCQ0MVGhp6ei8YAAA0OF6HphUrVui9995T3759f/OTW5alCRMm6K233tK6devUqVMnj+0JCQlq3LixcnJyNGTIEElSQUGBCgsL3auRO51OPfzwwyopKXFf5bZ69WrZ7XbFxcW5a9577z2Pfa9evdq9j5CQECUkJCgnJ0eDBg2S9PPpwpycHI0fP/43v04AANDweR2azjnnHDVv3twnT56enq6lS5fq7bffVvPmzd3zhxwOh8LDw+VwODRmzBhlZGSoZcuWstvt7p9vqV3uoH///oqLi9PNN9+sefPmqaioSDNmzFB6err7SNDtt9+uhQsX6u6779bo0aP1wQcf6I033lBW1v9ddZSRkaG0tDQlJiaqd+/emj9/viorKzVq1CifvFYAANCweR2annjiCU2dOlWLFy9Wx44df9OT1y6SecUVV3iML1myRLfccosk6amnnlJQUJCGDBmiqqoqpaSk6Nlnn3XXBgcHa8WKFRo3bpycTqeaNm2qtLQ0j59z6dSpk7KysjR58mQtWLBA7du310svveReo0mShg0bpv3792vmzJkqKipSjx49lJ2dXWdyOAAAODsZrdN0vP379+vPf/6z1q9fryZNmqhx48Ye2w8dOuTTBhsK1mmCv7BOE34N6zQBp+bzdZqOd+ONN+r777/XI488oqioKJ9MBAcAAAh0Xoemjz76SHl5eYqPj6+PfgAAAAKS14tbdunSRYcPH66PXgAAAAKW16Fp7ty5uvPOO7Vu3TodPHhQ5eXlHjcAAIAzkden5wYMGCBJ6tevn8e4ZVmy2WyqqanxTWcAAAABxOvQ9Hv9eC8AAEAg8To0XX755fXRBwAAQEDzOjStX7/+pNsvu+yy024GAAAgUHkdmn65erckj7WamNMEAADORF5fPffDDz943EpKSpSdna1evXrp/fffr48eAQAA/M7rI00Oh6PO2B//+EeFhIQoIyND+fn5PmkMAAAgkHh9pOnXREVFqaCgwFe7AwAACCheH2n6/PPPPe5blqV9+/Zp7ty56tGjh6/6AgAACCheh6YePXrIZrPJsiyP8T59+uhvf/ubzxoDAAAIJF6Hpp07d3rcDwoKUps2bRQWFuazpgAAAAKN16GpY8eO9dEHAABAQPM6NElSTk6OcnJyVFJSIpfL5bGNU3QAAOBM5HVoeuCBBzR79mwlJiaqbdu2HgtbAgAA
nKm8Dk2LFy9WZmambr755vroBwAAICB5vU5TdXW1LrnkkvroBQAAIGB5HZrGjh2rpUuX1kcvAAAAAcvr03NHjhzRCy+8oDVr1qh79+5q3Lixx/Ynn3zSZ80BAAAEitNaEbx25e/t27d7bGNSOAAAOFN5HZrWrl1bH30AAAAENJ/9YC8AAMCZjNAEAABggNAEAABggNAEAABggNAEAABggNAEAABggNAEAABgwOt1mgAADUPstCx/t+C1XXNT/d0C8Ks40gQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGCA0AQAAGDAr6Fp/fr1GjhwoNq1ayebzably5d7bLcsSzNnzlTbtm0VHh6u5ORkff311x41hw4d0ogRI2S32xUREaExY8aooqLCo+bzzz/XpZdeqrCwMMXExGjevHl1elm2bJm6dOmisLAwXXTRRXrvvfd8/noBAEDD1cifT15ZWan4+HiNHj1agwcPrrN93rx5evrpp/Xyyy+rU6dOuu+++5SSkqJ///vfCgsLkySNGDFC+/bt0+rVq3X06FGNGjVKt912m5YuXSpJKi8vV//+/ZWcnKzFixdr27ZtGj16tCIiInTbbbdJkj766CPdeOONmjNnjq699lotXbpUgwYN0pYtW9StW7ff7w2B38VOy/J3CwCAAGWzLMvydxOSZLPZ9NZbb2nQoEGSfj7K1K5dO915552aMmWKJKmsrExRUVHKzMzU8OHD9eWXXyouLk6bN29WYmKiJCk7O1vXXHONvvvuO7Vr107PPfec7r33XhUVFSkkJESSNG3aNC1fvlw7duyQJA0bNkyVlZVasWKFu58+ffqoR48eWrx48Qn7raqqUlVVlft+eXm5YmJiVFZWJrvd7vP3B78PQhPgX7vmpvq7BZxlysvL5XA4jP79Dtg5TTt37lRRUZGSk5PdYw6HQ0lJScrLy5Mk5eXlKSIiwh2YJCk5OVlBQUHauHGju+ayyy5zByZJSklJUUFBgX744Qd3zfHPU1tT+zwnMmfOHDkcDvctJibmt79oAAAQsAI2NBUVFUmSoqKiPMajoqLc24qKihQZGemxvVGjRmrZsqVHzYn2cfxz/FpN7fYTmT59usrKyty3PXv2ePsSAQBAA+LXOU0NWWhoqEJDQ/3dBgAA+J0EbGiKjo6WJBUXF6tt27bu8eLiYvXo0cNdU1JS4vG4Y8eO6dChQ+7HR0dHq7i42KOm9v6pamq3AwB+Hw1xXmFDnYfFe+29gD0916lTJ0VHRysnJ8c9Vl5ero0bN8rpdEqSnE6nSktLlZ+f76754IMP5HK5lJSU5K5Zv369jh496q5ZvXq1LrzwQrVo0cJdc/zz1NbUPg8AAIBfQ1NFRYW2bt2qrVu3Svp58vfWrVtVWFgom82mSZMm6aGHHtI777yjbdu2aeTIkWrXrp37CruuXbtqwIABuvXWW7Vp0yZt2LBB48eP1/Dhw9WuXTtJ0k033aSQkBCNGTNGX3zxhV5//XUtWLBAGRkZ7j4mTpyo7OxsPfHEE9qxY4fuv/9+ffLJJxo/fvzv/ZYAAIAA5dfTc5988omuvPJK9/3aIJOWlqbMzEzdfffdqqys1G233abS0lL9v//3/5Sdne1eo0mSXnnlFY0fP179+vVTUFCQhgwZoqefftq93eFw6P3331d6eroSEhLUunVrzZw5071GkyRdcsklWrp0qWbMmKF77rlH559/vpYvX84aTQAAwC1g1mlq6LxZ5wGBqyGe4wfgX/6eZ3O6GuL/7+rjvT4j
1mkCAAAIJIQmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA4QmAAAAA3797TkAABq6hvhzJDg9HGkCAAAwQGgCAAAwQGgCAAAwQGgCAAAwQGgCAAAwwNVzqDdcUQIAOJNwpAkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMAAoQkAAMBAI383ADOx07L83QIAAGc1jjQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDQBAAAYIDT9wqJFixQbG6uwsDAlJSVp06ZN/m4JAAAEAELTcV5//XVlZGRo1qxZ2rJli+Lj45WSkqKSkhJ/twYAAPyM0HScJ598UrfeeqtGjRqluLg4LV68WE2aNNHf/vY3f7cGAAD8rJG/GwgU1dXVys/P1/Tp091jQUFBSk5OVl5eXp36qqoqVVVVue+XlZVJksrLy+ulP1fVT/WyXwAAGor6+De2dp+WZZ2yltD0XwcOHFBNTY2ioqI8xqOiorRjx4469XPmzNEDDzxQZzwmJqbeegQA4GzmmF9/+/7xxx/lcDhOWkNoOk3Tp09XRkaG+77L5dLu3bvVo0cP7dmzR3a73Y/dwUR5ebliYmL4vBoAPquGg8+qYeHz+vkI048//qh27dqdspbQ9F+tW7dWcHCwiouLPcaLi4sVHR1dpz40NFShoaEeY0FBP08Rs9vtZ+1/fA0Rn1fDwWfVcPBZNSxn++d1qiNMtZgI/l8hISFKSEhQTk6Oe8zlciknJ0dOp9OPnQEAgEDAkabjZGRkKC0tTYmJierdu7fmz5+vyspKjRo1yt+tAQAAPyM0HWfYsGHav3+/Zs6cqaKiIvXo0UPZ2dl1Jof/mtDQUM2aNavOaTsEJj6vhoPPquHgs2pY+Ly8Y7NMrrEDAAA4yzGnCQAAwAChCQAAwAChCQAAwAChCQAAwAChyYcWLVqk2NhYhYWFKSkpSZs2bfJ3S/iF+++/XzabzePWpUsXf7eF/1q/fr0GDhyodu3ayWazafny5R7bLcvSzJkz1bZtW4WHhys5OVlff/21f5o9y53qs7rlllvqfNcGDBjgn2bPcnPmzFGvXr3UvHlzRUZGatCgQSooKPCoOXLkiNLT09WqVSs1a9ZMQ4YMqbPYMwhNPvP6668rIyNDs2bN0pYtWxQfH6+UlBSVlJT4uzX8wh/+8Aft27fPffvwww/93RL+q7KyUvHx8Vq0aNEJt8+bN09PP/20Fi9erI0bN6pp06ZKSUnRkSNHfudOcarPSpIGDBjg8V179dVXf8cOUSs3N1fp6en6+OOPtXr1ah09elT9+/dXZWWlu2by5Ml69913tWzZMuXm5mrv3r0aPHiwH7sOUBZ8onfv3lZ6err7fk1NjdWuXTtrzpw5fuwKvzRr1iwrPj7e323AgCTrrbfect93uVxWdHS09dhjj7nHSktLrdDQUOvVV1/1Q4eo9cvPyrIsKy0tzbr++uv90g9OrqSkxJJk5ebmWpb18/eocePG1rJly9w1X375pSXJysvL81ebAYkjTT5QXV2t/Px8JScnu8eCgoKUnJysvLw8P3aGE/n666/Vrl07nXvuuRoxYoQKCwv93RIM7Ny5U0VFRR7fM4fDoaSkJL5nAWrdunWKjIzUhRdeqHHjxungwYP+bgmSysrKJEktW7aUJOXn5+vo0aMe360uXbqoQ4cOfLd+gdDk
AwcOHFBNTU2dlcOjoqJUVFTkp65wIklJScrMzFR2draee+457dy5U5deeql+/PFHf7eGU6j9LvE9axgGDBig//3f/1VOTo4effRR5ebm6uqrr1ZNTY2/WzuruVwuTZo0SX379lW3bt0k/fzdCgkJUUREhEct3626+BkVnFWuvvpq99/du3dXUlKSOnbsqDfeeENjxozxY2fAmWX48OHuvy+66CJ1795d5513ntatW6d+/fr5sbOzW3p6urZv385cztPEkSYfaN26tYKDg+tcaVBcXKzo6Gg/dQUTERERuuCCC/TNN9/4uxWcQu13ie9Zw3TuueeqdevWfNf8aPz48VqxYoXWrl2r9u3bu8ejo6NVXV2t0tJSj3q+W3URmnwgJCRECQkJysnJcY+5XC7l5OTI6XT6sTOcSkVFhb799lu1bdvW363gFDp16qTo6GiP71l5ebk2btzI96wB+O6773Tw4EG+a35gWZbGjx+vt956Sx988IE6derksT0hIUGNGzf2+G4VFBSosLCQ79YvcHrORzIyMpSWlqbExET17t1b8+fPV2VlpUaNGuXv1nCcKVOmaODAgerYsaP27t2rWbNmKTg4WDfeeKO/W4N+DrHHH4nYuXOntm7dqpYtW6pDhw6aNGmSHnroIZ1//vnq1KmT7rvvPrVr106DBg3yX9NnqZN9Vi1bttQDDzygIUOGKDo6Wt9++63uvvtude7cWSkpKX7s+uyUnp6upUuX6u2331bz5s3d85QcDofCw8PlcDg0ZswYZWRkqGXLlrLb7ZowYYKcTqf69Onj5+4DjL8v3zuTPPPMM1aHDh2skJAQq3fv3tbHH3/s75bwC8OGDbPatm1rhYSEWOecc441bNgw65tvvvF3W/ivtWvXWpLq3NLS0izL+nnZgfvuu8+KioqyQkNDrX79+lkFBQX+bfosdbLP6qeffrL69+9vtWnTxmrcuLHVsWNH69Zbb7WKior83fZZ6USfkyRryZIl7prDhw9bf/nLX6wWLVpYTZo0sW644QZr3759/ms6QNksy7J+/6gGAADQsDCnCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCYBPXHHFFZo0aZK/25AkrVu3Tjabrc4PkPrC/fffr6ioKNlsNi1fvvy0+snMzFRERIRXzxsbG6v58+d79RgAvkVoAtCg/Z5h7csvv9QDDzyg559/Xvv27dPVV199WvsZNmyYvvrqKx935x1CGOA9frAXAAx9++23kqTrr79eNpvttPcTHh6u8PBwX7UF4HfCkSYA9aKqqkpTpkzROeeco6ZNmyopKUnr1q1zb689RbVq1Sp17dpVzZo104ABA7Rv3z53zbFjx3THHXcoIiJCrVq10tSpU5WWlqZBgwZJkm655Rbl5uZqwYIFstlsstls2rVrl/vx+fn5SkxMVJMmTXTJJZeooKDgpD1v27ZNV111lcLDw9WqVSvddtttqqiokPTzabmBAwdKkoKCgk4amt577z1dcMEFCg8P15VXXunR0/Gvvda3336r66+/XlFRUWrWrJl69eqlNWvWnLTX0tJSjR07Vm3atJHdbtdVV12lzz77zKPm3XffVa9evRQWFqbWrVvrhhtukPTz0bndu3dr8uTJ7vcNwKkRmgDUi/HjxysvL0+vvfaaPv/8c/3pT3/SgAED9PXXX7trfvrpJz3++OP6+9//rvXr16uwsFBTpkxxb3/00Uf1yiuvaMmSJdqwYYPKy8s95hEtWLBATqdTt956q/bt26d9+/YpJibGvf3ee+/VE088oU8++USNGjXS6NGjf7XfyspKpaSkqEWLFtq8ebOWLVumNWvWaPz48ZKkKVOmaMmSJZLkfq4T2bNnjwYPHqyBAwdq69atGjt2rKZNm3bS96qiokLXXHONcnJy9Omnn2rAgAEaOHCgCgsLf/Uxf/rTn1RSUqKVK1cqPz9fPXv2VL9+/XTo0CFJ
UlZWlm644QZdc801+vTTT5WTk6PevXtLkt588021b99es2fPPulrAfALFgD4wOWXX25NnDjRsizL2r17txUcHGx9//33HjX9+vWzpk+fblmWZS1ZssSSZH3zzTfu7YsWLbKioqLc96OioqzHHnvMff/YsWNWhw4drOuvv/6Ez1tr7dq1liRrzZo17rGsrCxLknX48OET9v/CCy9YLVq0sCoqKjweExQUZBUVFVmWZVlvvfWWdar/bU6fPt2Ki4vzGJs6daolyfrhhx/cr93hcJx0P3/4wx+sZ555xn2/Y8eO1lNPPWVZlmX961//sux2u3XkyBGPx5x33nnW888/b1mWZTmdTmvEiBG/uv/j9wfADHOaAPjctm3bVFNTowsuuMBjvKqqSq1atXLfb9Kkic477zz3/bZt26qkpESSVFZWpuLiYvfREUkKDg5WQkKCXC6XUR/du3f32LcklZSUqEOHDnVqv/zyS8XHx6tp06busb59+8rlcqmgoEBRUVFGz/nll18qKSnJY8zpdJ70MRUVFbr//vuVlZWlffv26dixYzp8+PCvHmn67LPPVFFR4fFeStLhw4fd8662bt2qW2+91ahnAGYITQB8rqKiQsHBwcrPz1dwcLDHtmbNmrn/bty4scc2m80my7J81sfx+6+dt2MauH5PU6ZM0erVq/X444+rc+fOCg8P19ChQ1VdXX3C+oqKCrVt29Zjjlit2rlSTDQHfI/QBMDnLr74YtXU1KikpESXXnrpae3D4XAoKipKmzdv1mWXXSZJqqmp0ZYtW9SjRw93XUhIiGpqan5zz127dlVmZqYqKyvdR5s2bNigoKAgXXjhhV7t55133vEY+/jjj0/6mA0bNuiWW25xT9SuqKioM3n8eD179lRRUZEaNWqk2NjYE9Z0795dOTk5GjVq1Am3++p9A84mTAQH4HMXXHCBRowYoZEjR+rNN9/Uzp07tWnTJs2ZM0dZWVnG+5kwYYLmzJmjt99+WwUFBZo4caJ++OEHj6u9YmNjtXHjRu3atUsHDhw47SNJI0aMUFhYmNLS0rR9+3atXbtWEyZM0M0332x8ak6Sbr/9dn399de66667VFBQoKVLlyozM/Okjzn//PP15ptvauvWrfrss8900003nfR1JCcny+l0atCgQXr//fe1a9cuffTRR7r33nv1ySefSJJmzZqlV199VbNmzdKXX36pbdu26dFHH3XvIzY2VuvXr9f333+vAwcOGL8+4GxGaAJQL5YsWaKRI0fqzjvv1IUXXqhBgwZp8+bNJ5xP9GumTp2qG2+8USNHjpTT6VSzZs2UkpKisLAwd82UKVMUHBysuLg4tWnT5qRXnJ1MkyZNtGrVKh06dEi9evXS0KFD1a9fPy1cuNCr/XTo0EH//Oc/tXz5csXHx2vx4sV65JFHTvqYJ598Ui1atNAll1yigQMHKiUlRT179vzVepvNpvfee0+XXXaZRo0apQsuuEDDhw/X7t273QHviiuu0LJly/TOO++oR48euuqqq7Rp0yb3PmbPnq1du3bpvPPOU5s2bbx6jcDZymb5cgIBANQjl8ulrl276s9//rMefPBBf7cD4CzDnCYAAWv37t16//33dfnll6uqqkoLFy7Uzp07ddNNN/m7NQBnIU7PAQhYQUFByszMVK9evdS3b19t27ZNa9asUdeuXf3dGoCzEKfnAAAADHCkCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwAChCQAAwMD/B3amTj7/NKyzAAAAAElFTkSuQmCC\n"},"metadata":{}}]},{"cell_type":"code","source":["plt.hist(sentenceLengths(filtered_standard), bins=10)\n","plt.xlabel('length of standard')\n","plt.ylabel('number of 
standard')\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":449},"id":"2g430fC7RliO","executionInfo":{"status":"ok","timestamp":1718494938349,"user_tz":-540,"elapsed":1312,"user":{"displayName":"김범진","userId":"02150140531333380287"}},"outputId":"48af15ea-d43b-4dff-ed04-074c70c52752"},"execution_count":27,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["SOS_token = 0\n","EOS_token = 0\n","\n","class Lang:\n"," def __init__(self, name):\n"," self.name = name\n"," self.word2index = {}\n"," self.word2count = {}\n"," self.index2word = {0: \"SOS\", 1: \"EOS\"}\n"," self.n_words = 2 # SOS, EOS\n","\n"," def addSentence(self, sentence):\n"," for word in sentence.split(\" \"):\n"," self.addWord(word)\n","\n"," def addWord(self, word):\n"," if word not in self.word2index:\n"," self.word2index[word] = self.n_words\n"," self.word2count[word] = 1\n"," self.index2word[self.n_words] = word\n"," self.n_words += 1\n"," else:\n"," self.word2count[word] += 1"],"metadata":{"id":"oMl0xGNU49XX","executionInfo":{"status":"ok","timestamp":1718495009410,"user_tz":-540,"elapsed":411,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":28,"outputs":[]},{"cell_type":"code","source":["# Lang 객체 생성\n","dialect_lang = Lang(\"Dialect\")\n","standard_lang = Lang(\"Standard\")\n","\n","# 문장 추가\n","for sentence in filtered_dialect:\n"," dialect_lang.addSentence(sentence)\n","for sentence in filtered_standard:\n"," standard_lang.addSentence(sentence)\n","for sentence in filtered_df_VL['src']:\n"," dialect_lang.addSentence(sentence)\n","for sentence in filtered_df_VL['tar']:\n"," standard_lang.addSentence(sentence)\n","\n","# 문장\n","pairs = list(zip(filtered_dialect, filtered_standard))\n","VL_pairs = list(zip(filtered_df_VL['src'], filtered_df_VL['tar']))\n","\n","# 문장을 인덱스로 변환\n","def indexesFromSentence(lang, sentence):\n"," return [lang.word2index[word] for word in sentence.split(' ')]\n","\n","def tensorFromSentence(lang, sentence):\n"," indexes = indexesFromSentence(lang, sentence)\n"," indexes.append(EOS_token)\n"," if len(indexes) < max_len:\n"," indexes.extend([EOS_token] * (max_len - len(indexes))) # 패딩 추가\n"," return torch.tensor(indexes[:max_len], dtype=torch.long).view(-1, 1)\n","\n","def tensorsFromPair(pair):\n"," input_tensor = 
tensorFromSentence(dialect_lang, pair[0])\n"," target_tensor = tensorFromSentence(standard_lang, pair[1])\n"," return (input_tensor, target_tensor)"],"metadata":{"id":"VBPYjCbZ8l6k","executionInfo":{"status":"ok","timestamp":1718495461417,"user_tz":-540,"elapsed":5024,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":30,"outputs":[]},{"cell_type":"code","source":["import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","\n","# 검증 데이터를 인덱스로 변환\n","validation_input_tensors = [tensorFromSentence(dialect_lang, pair[0]) for pair in VL_pairs]\n","validation_target_tensors = [tensorFromSentence(standard_lang, pair[1]) for pair in VL_pairs]\n","\n","class EncoderRNN(nn.Module):\n"," def __init__(self, input_size, hidden_size):\n"," super(EncoderRNN, self).__init__()\n"," self.hidden_size = hidden_size\n"," self.embedding = nn.Embedding(input_size, hidden_size)\n"," self.lstm = nn.LSTM(hidden_size, hidden_size)\n","\n"," def forward(self, input, hidden):\n"," embedded = self.embedding(input).view(1, 1, -1)\n"," output, hidden = self.lstm(embedded, hidden)\n"," return output, hidden\n","\n"," def initHidden(self):\n"," return (torch.zeros(1, 1, self.hidden_size),\n"," torch.zeros(1, 1, self.hidden_size))\n","\n","class AttnDecoderRNN(nn.Module):\n"," def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=max_len):\n"," super(AttnDecoderRNN, self).__init__()\n"," self.hidden_size = hidden_size\n"," self.output_size = output_size\n"," self.dropout_p = dropout_p\n"," self.max_length = max_length\n","\n"," self.embedding = nn.Embedding(self.output_size, self.hidden_size)\n"," self.attn = nn.Linear(self.hidden_size * 2, self.max_length)\n"," self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)\n"," self.dropout = nn.Dropout(self.dropout_p)\n"," self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)\n"," self.out = nn.Linear(self.hidden_size, self.output_size)\n","\n"," def forward(self, input, 
hidden, encoder_outputs):\n"," embedded = self.embedding(input).view(1, 1, -1)\n"," embedded = self.dropout(embedded)\n","\n"," attn_weights = nn.functional.softmax(\n"," self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)\n"," attn_applied = torch.bmm(attn_weights.unsqueeze(0),\n"," encoder_outputs.unsqueeze(0))\n","\n"," output = torch.cat((embedded[0], attn_applied[0]), 1)\n"," output = self.attn_combine(output).unsqueeze(0)\n","\n"," output = nn.functional.relu(output)\n"," output, hidden = self.lstm(output, hidden)\n","\n"," output = nn.functional.log_softmax(self.out(output[0]), dim=1)\n"," return output, hidden, attn_weights\n","\n"," def initHidden(self):\n"," return (torch.zeros(1, 1, self.hidden_size),\n"," torch.zeros(1, 1, self.hidden_size))"],"metadata":{"id":"EyqODVGn87BL","executionInfo":{"status":"ok","timestamp":1718495480263,"user_tz":-540,"elapsed":6681,"user":{"displayName":"김범진","userId":"02150140531333380287"}}},"execution_count":31,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"UPHbq6_PL11X"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import random\n","\n","def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=max_len):\n"," encoder_hidden = encoder.initHidden()\n","\n"," encoder_optimizer.zero_grad()\n"," decoder_optimizer.zero_grad()\n","\n"," input_length = input_tensor.size(0)\n"," target_length = target_tensor.size(0)\n","\n"," encoder_outputs = torch.zeros(max_length, encoder.hidden_size)\n","\n"," loss = 0\n","\n"," for ei in range(input_length):\n"," encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n"," encoder_outputs[ei] = encoder_output[0, 0]\n","\n"," decoder_input = torch.tensor([[SOS_token]])\n","\n"," decoder_hidden = encoder_hidden\n","\n"," for di in range(target_length):\n"," decoder_output, decoder_hidden, decoder_attention = decoder(\n"," decoder_input, decoder_hidden, 
encoder_outputs)\n"," topv, topi = decoder_output.topk(1)\n"," decoder_input = topi.squeeze().detach() # 다음 입력으로 사용\n","\n"," loss += criterion(decoder_output, target_tensor[di])\n"," if decoder_input.item() == EOS_token:\n"," break\n","\n"," loss.backward()\n","\n"," encoder_optimizer.step()\n"," decoder_optimizer.step()\n","\n"," return loss.item() / target_length\n","\n","def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):\n"," encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)\n"," decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)\n"," training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]\n"," criterion = nn.NLLLoss()\n","\n"," for iter in range(1, n_iters + 1):\n"," training_pair = training_pairs[iter - 1]\n"," input_tensor = training_pair[0]\n"," target_tensor = training_pair[1]\n","\n"," loss = train(input_tensor, target_tensor, encoder,\n"," decoder, encoder_optimizer, decoder_optimizer, criterion)\n"," if iter % print_every == 0:\n"," print(f'Iteration: {iter}, Loss: {loss}')"],"metadata":{"id":"Uaozw3dc_vdk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 모델 초기화 및 훈련\n","hidden_size = 256\n","encoder = EncoderRNN(dialect_lang.n_words, hidden_size)\n","decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)\n","\n","trainIters(encoder, decoder, 75000, print_every=5000)"],"metadata":{"id":"JLgmcaB5UKtN"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def saveModel(encoder, decoder, encoder_path='encoder.pth', decoder_path='decoder.pth'): ## 모델 저장\n"," torch.save(encoder.state_dict(), encoder_path)\n"," torch.save(decoder.state_dict(), decoder_path)\n","\n","def loadModel(encoder_path='encoder.pth', decoder_path='decoder.pth'): ## 모델 로드\n"," encoder = EncoderRNN(dialect_lang.n_words, hidden_size)\n"," decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)\n"," 
encoder.load_state_dict(torch.load(encoder_path))\n"," decoder.load_state_dict(torch.load(decoder_path))\n"," return encoder, decoder\n"],"metadata":{"id":"_Gjpck4MUHBm"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 테스트 함수\n","\n","def evaluate(encoder, decoder, sentence, max_length=max_len):\n"," with torch.no_grad():\n"," input_tensor = tensorFromSentence(dialect_lang, sentence)\n"," input_length = input_tensor.size()[0]\n"," encoder_hidden = encoder.initHidden()\n","\n"," encoder_outputs = torch.zeros(max_length, encoder.hidden_size)\n","\n"," for ei in range(input_length):\n"," encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n"," encoder_outputs[ei] = encoder_output[0, 0]\n","\n"," decoder_input = torch.tensor([[SOS_token]]) # SOS token\n"," decoder_hidden = encoder_hidden\n","\n"," decoded_words = []\n"," decoder_attentions = torch.zeros(max_length, max_length)\n","\n"," for di in range(max_length):\n"," decoder_output, decoder_hidden, decoder_attention = decoder(\n"," decoder_input, decoder_hidden, encoder_outputs)\n"," decoder_attentions[di] = decoder_attention.data\n"," topv, topi = decoder_output.data.topk(1)\n"," if topi.item() == EOS_token:\n"," decoded_words.append('')\n"," break\n"," else:\n"," decoded_words.append(standard_lang.index2word[topi.item()])\n","\n"," decoder_input = topi.squeeze().detach()\n","\n"," return decoded_words\n","\n","def evaluateRandomly(encoder, decoder, n=10):\n"," for i in range(n):\n"," pair = random.choice(test_pairs)\n"," print('Dialect:', pair[0])\n"," print('Expected:', pair[1])\n"," output_words = evaluate(encoder, decoder, pair[0])\n"," output_sentence = ' '.join(output_words)\n"," print('Predicted:', output_sentence)\n"," print('')\n"],"metadata":{"id":"zYySN_5AUvbG"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["## 테스트 데이터 준비 필요\n","test_dialect_sentences = []\n","test_standard_sentences = []\n","\n","test_pairs = 
list(zip(test_dialect_sentences, test_standard_sentences))"],"metadata":{"id":"ch8xAa69U5DA"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["## 테스트 함수 실행\n","evaluateRandomly(encoder, decoder, n=len(test_pairs))"],"metadata":{"id":"JQNbhsGTVRCe"},"execution_count":null,"outputs":[]}]} \ No newline at end of file +{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":23937,"status":"ok","timestamp":1718494161844,"user":{"displayName":"김범진","userId":"02150140531333380287"},"user_tz":-540},"id":"Ix1Jbg_xWTxg","outputId":"b27e89ee-70d9-4b93-9b09-d96eb688bf0f"},"outputs":[{"name":"stdout","output_type":"stream","text":["Mounted at /content/drive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":2,"metadata":{"executionInfo":{"elapsed":7382,"status":"ok","timestamp":1718494169225,"user":{"displayName":"김범진","userId":"02150140531333380287"},"user_tz":-540},"id":"xPCQBU1BWfcw"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import os\n","import json\n","import csv\n","\n","TL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataTL.csv'\n","VL_sentence_path = '/content/drive/MyDrive/LSTM+attention/sentence_dataVL.csv'\n","\n","# data파일 불러오기\n","TL_sentence_data = pd.read_csv(TL_sentence_path, encoding='utf-8')\n","VL_sentence_data = pd.read_csv(VL_sentence_path, encoding='utf-8')\n","\n","# 중복 제거, Pronuncication 열은 필요 없다고 생각\n","TL_sentence_data.drop('Pronunciation', axis=1, inplace=True)\n","TL_sentence_data = TL_sentence_data.drop_duplicates().reset_index(drop=True)\n","VL_sentence_data.drop('Pronunciation', axis=1, inplace=True)\n","VL_sentence_data = 
VL_sentence_data.drop_duplicates().reset_index(drop=True)"]},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1718494169226,"user":{"displayName":"김범진","userId":"02150140531333380287"},"user_tz":-540},"id":"UZGjs0aPXrCe","outputId":"d8d38195-d138-43da-ab0a-6ba293dc58ca"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"summary":"{\n \"name\": \"TL_sentence_data[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Dialect\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uac00 \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uae30 \\uc788\\uc2b5\\ub2c8\\uaef4\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub315\\uae30\\uc774\\uaebc\\ub124 \\uc778\\uc790 \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c \\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub760\\ub118\\uc73c\\uba74 \\uc548 \\ub374\\ub2e4 \\ucea4\\uc2b5\\ub2c8\\uaef4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Standard\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uc11c \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uac8c \\uc788\\uc2b5\\ub2c8\\uae4c\",\n \"\\uc774 \\uad6c\\ub450 \\ud558\\ub098\\ub9cc \\uacc4\\uc18d \\uc2e0\\uace0 \\ub2e4\\ub2c8\\ub2c8\\uae4c \\uc774\\uc81c \\uad7d\\uc774 \\ub9ce\\uc774 \\ub2f3\\uc544\\uc11c \\uac08\\uc544\\uc57c \\ub418\\uaca0\\ub124\",\n \"\\uc608\\uc804\\uc5d0\\ub294 \\uc9d1 \\uc548\\uc5d0\\uc11c 
\\uc5ec\\uc790\\ub4e4\\uc774 \\ub0a8\\uc790 \\uc704\\ub85c \\ub6f0\\uc5b4\\ub118\\uc73c\\uba74 \\uc548 \\ub41c\\ub2e4 \\ud588\\uc2b5\\ub2c8\\uae4c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}","type":"dataframe"},"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DialectStandard
0여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
1장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까
2예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까
3음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까
4이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"text/plain":[" Dialect \\\n","0 여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴 \n","1 장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴 \n","2 예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴 \n","3 음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까 \n","4 이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네 \n","\n"," Standard \n","0 여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까 \n","1 장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까 \n","2 예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까 \n","3 음식을 많이 장만하려고 하면 일손이 모자라서 음식하기가 안 힘들었습니까 \n","4 이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네 "]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["TL_sentence_data[:5]"]},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1718494169226,"user":{"displayName":"김범진","userId":"02150140531333380287"},"user_tz":-540},"id":"a0cWFdpxDKN7","outputId":"1f81d833-f41a-44eb-b832-18fb05072e3d"},"outputs":[{"data":{"application/vnd.google.colaboratory.intrinsic+json":{"summary":"{\n \"name\": \"VL_sentence_data[:5]\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Dialect\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\ud608\\uc555\\uc57d\\uc740 \\uc2dc\\uac04\\uc744 \\ub9de\\ucdb0 \\ucc59\\uaca8 \\ub4dc\\uc154\\uc57c\\uc9c0 \\uc548 \\uadf8\\ub7ec\\uba74 \\ud6a8\\uacfc\\uac00 \\uc5c6\\uc2b5\\ub2c8\\ub2e4\",\n \"\\uc61b\\ub0a0\\ubd80\\ud130 \\uc870\\uc0c1\\uafc8\\uc774\\ub098 \\ub3fc\\uc9c0\\uafc8 \\uafb8\\ub9cc \\uc9d1\\uc5d0 \\ub3c8 \\ub9ce\\uc774 \\ub4e4\\uc5b4\\uc628\\ub2e4\\uace0 \\uc88b\\uc544 \\ud574\\uc9c0\\ub85c\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\uc774\\uaebc\\ub124 \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\ubf08\\ub2e4\\uc9c0\\uac00 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Standard\",\n 
\"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"\\ud608\\uc555\\uc57d\\uc740 \\uc2dc\\uac04\\uc744 \\ub9de\\ucdb0 \\ucc59\\uaca8 \\ub4dc\\uc154\\uc57c\\uc9c0 \\uc548 \\uadf8\\ub7ec\\uba74 \\ud6a8\\uacfc\\uac00 \\uc5c6\\uc2b5\\ub2c8\\ub2e4\",\n \"\\uc61b\\ub0a0\\ubd80\\ud130 \\uc870\\uc0c1\\uafc8\\uc774\\ub098 \\ub3fc\\uc9c0\\uafc8 \\uafb8\\uba74 \\uc9d1\\uc5d0 \\ub3c8 \\ub9ce\\uc774 \\ub4e4\\uc5b4\\uc628\\ub2e4\\uace0 \\uc88b\\uc544 \\ud588\\uc8e0\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\ub2c8\\uae4c \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\uc11c\\ub78d\\uc774 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}","type":"dataframe"},"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DialectStandard
0오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요
1혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다혈압약은 시간을 맞춰 챙겨 드셔야지 안 그러면 효과가 없습니다
2집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼
3아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐
4옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
# %% Preview the first five validation rows
VL_sentence_data.iloc[:5]

# %% Pull the dialect / standard columns out of each frame as Series
dialect_sentences_TL, standard_sentences_TL = (
    TL_sentence_data['Dialect'],
    TL_sentence_data['Standard'],
)
dialect_sentences_VL, standard_sentences_VL = (
    VL_sentence_data['Dialect'],
    VL_sentence_data['Standard'],
)

# %% Preview the first five standard-language training sentences
standard_sentences_TL.iloc[:5]

# %% Preview the first five dialect training sentences
dialect_sentences_TL.iloc[:5]
\\uce74\\ub358\\ub514 \\uc625\\uc218\\uac31\\uc774 \\ub9d0\\uace0\\ub294 \\ubb34\\uc2e0 \\ub18d\\uc0ac\\ub97c \\ub9c8\\uc774 \\uc9d3\\uc2b5\\ub2c8\\uaef4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tar\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"\\ub17c\\ub450\\ub801\\uc5d0 \\uc804\\uc120\\uc774 \\ub298\\uc5b4\\uc838 \\uc788\\uac70\\ub098 \\uc815\\uc804\\uc774 \\ub410\\uc744 \\ub54c \\ub450\\uaebc\\ube44 \\uc9d1\\uc744 \\ud568\\ubd80\\ub85c \\ub9cc\\uc9c0\\uba74 \\uc704\\ud5d8\\ud569\\ub2c8\\ub2e4\",\n \"\\uc7a5\\ub840 \\uad00\\ub828\\ud574\\uc11c \\ucd08\\uc0c1\\uc9d1 \\uac19\\uc740 \\ub370 \\uac00\\uc11c \\ud558\\uc9c0 \\ub9d0\\uc544\\uc57c \\ub370\\ub294 \\uac8c \\uc788\\uc2b5\\ub2c8\\uae4c\",\n \"\\uc5ec\\uae30\\uc5d0\\ub294 \\uc625\\uc218\\uc218\\uac00 \\uc798 \\ub41c\\ub2e4 \\ud558\\ub358\\ub370 \\uc625\\uc218\\uc218 \\ub9d0\\uace0\\ub294 \\ubb34\\uc2a8 \\ub18d\\uc0ac\\ub97c \\ub9ce\\uc774 \\uc9d3\\uc2b5\\ub2c8\\uae4c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}","type":"dataframe"},"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
srctar
0여기는 옥수갱이 잘 된다 카던디 옥수갱이 말고는 무신 농사를 많이 짓습니껴여기는 옥수수 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
1장례 관련해서 초상집 같은 데 가가 하지 말아야 데는 기 있습니껴장례 관련해서 초상집 같은 데 가서 하지 말아야 데는 게 있습니까
2예전에는 집 안에서 여자들이 남자 위로 띠넘으면 안 덴다 캤습니껴예전에는 집 안에서 여자들이 남자 위로 뛰어넘으면 안 된다 했습니까
3이 구두 하나만 계속 신고 댕기이꺼네 인자 굽이 많이 닳아서 갈아야 되겠네이 구두 하나만 계속 신고 다니니까 이제 굽이 많이 닳아서 갈아야 되겠네
4콩이파리는 가시가 있어가 꺼끄럽고 뻣뻣하고 묵어 보면 맛이 없어예콩잎은 가시가 있어서 껄끄럽고 뻣뻣하고 먹어 보면 맛이 없어요
5여기에는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴여기에는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
6여개는 옥수갱이가 잘 된다 카던디 옥수갱이 말고는 무신 농사를 마이 짓습니껴여기는 옥수수가 잘 된다 하던데 옥수수 말고는 무슨 농사를 많이 짓습니까
7음식 먹으만 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니데이음식 먹으면 계속 설사하고 토하고 할 때는 물 많이 잡수고 병원에 가봐야 합니다
8논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 무짜로 만지만 위험합니더논두렁에 전선이 늘어져 있거나 정전이 됐을 때 두꺼비 집을 함부로 만지면 위험합니다
9딱꾹지를 멈치지도 않고 점들 하는디 이럴 때는 우예 해야 합니껴딱꾹지를 멈추지도 않고 점들 하는데 이럴 때는 어떻게 해야 합니까
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
# %% Training data: drop pairs whose dialect and standard sentences are the same string
filtered_data_TR = {"src": [], "tar": []}

# Both Series come from the same frame with a fresh 0..n-1 index, so walking
# them in lockstep visits the same pairs as indexing by row number.
for dialect, standard in zip(dialect_sentences_TL, standard_sentences_TL):
    if standard == dialect:
        continue  # identical on both sides: not kept
    filtered_data_TR["src"].append(dialect)
    filtered_data_TR["tar"].append(standard)

filtered_df_TR = pd.DataFrame(filtered_data_TR)

filtered_df_TR[:10]
[\n {\n \"column\": \"src\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"\\uc18c\\ub3c4 \\uc0ac\\ub78c \\ub9e8\\uce58\\ub85c \\uc798 \\uba39\\uc5b4\\uc57c \\uadfc\\uc721\\ub3c4 \\ubd87\\uace0 \\ud798\\ub3c4 \\uc0dd\\uaca8\\uc11c \\uc77c\\uc744 \\uc798 \\ud558\\uc9c0\\uc694\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\uc774\\uaebc\\ub124 \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\ubf08\\ub2e4\\uc9c0\\uac00 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\",\n \"\\ucd0c\\uad6c\\uc219\\uc774\\ub77c \\uc80a\\uc740 \\uc0ac\\ub78c\\ub4e4\\uc740 \\ud568\\ubd80\\ub808 \\uc5c6\\uace0 \\uc804\\ubd80 \\ub178\\uc778\\ub4e4\\ub9cc \\uc788\\uc73c\\uc774\\uaebc\\ub124 \\ub18d\\uc0ac \\uc9d3\\uae30\\uac00 \\ud798\\ub4e4\\uc5b4\\uc694\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tar\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"\\uc18c\\ub3c4 \\uc0ac\\ub78c \\ucc98\\ub7fc \\uc798 \\uba39\\uc5b4\\uc57c \\uadfc\\uc721\\ub3c4 \\ubd87\\uace0 \\ud798\\ub3c4 \\uc0dd\\uaca8\\uc11c \\uc77c\\uc744 \\uc798 \\ud558\\uc9c0\\uc694\",\n \"\\uc9d1\\uc5d0 \\ub3cc\\uc544\\uc640 \\ubcf4\\ub2c8\\uae4c \\ubb38\\uc774 \\uc5f4\\ub824 \\uc788\\uace0 \\uc11c\\ub78d\\uc774 \\uc5f4\\uc5b4\\ub454 \\ub3c8 \\uc804\\ubd80 \\uc5c6\\uc5b4\\uc9c0\\ub358 \\uc5b4\\uc774\\ub5bc\",\n \"\\ucd0c\\uad6c\\uc11d\\uc774\\ub77c \\uc80a\\uc740 \\uc0ac\\ub78c\\ub4e4\\uc740 \\uc544\\uc608 \\uc5c6\\uace0 \\uc804\\ubd80 \\ub178\\uc778\\ub4e4\\ub9cc \\uc788\\uc73c\\ub2c8\\uae4c \\ub18d\\uc0ac \\uc9d3\\uae30\\uac00 \\ud798\\ub4e4\\uc5b4\\uc694\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}","type":"dataframe"},"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
srctar
0오랫동안 한 동네에서 살았던 할머니이제 도주식하면 매매 아프네요오랫동안 한 동네에서 살았던 할머니인데 돌아가겨서 마음이 아프네요
1집에 돌아와 보이꺼네 문이 열려 있고 뼈다지가 열어둔 돈 전부 없어지던 어이떼집에 돌아와 보니까 문이 열려 있고 서랍이 열어둔 돈 전부 없어지던 어이떼
2아들 오늘 중요한 시험 보니까에 이 생엿 하고 사가꼬 먹고 힘내서 시험 잘 봐레이아들 오늘 중요한 시험 보니까 이 생 엿 하고 사서 먹고 힘내서 시험 잘 봐
3옛날부터 조상꿈이나 돼지꿈 꾸만 집에 돈 많이 들어온다고 좋아 해지로옛날부터 조상꿈이나 돼지꿈 꾸면 집에 돈 많이 들어온다고 좋아 했죠
4게얼에 먹을 채소나 과일 같은 것은 어데 보관을 했습니꺼겨울에 먹을 채소나 과일 같은 것은 어디에 보관을 했습니까
5촌구숙이라 젊은 사람들은 함부레 없고 전부 노인들만 있으이꺼네 농사 짓기가 힘들어요촌구석이라 젊은 사람들은 아예 없고 전부 노인들만 있으니까 농사 짓기가 힘들어요
6촌구석이라 젊은 사람들은 한 번이 없고 전부 노인들만 있으니까네 농사 짓기가 힘들어요촌구석이라 젊은 사람들은 한 번이 없고 전부 노인들만 있으니까 농사 짓기가 힘들어요
7소도 사람맨치로 잘 먹어야 근육도 붙고 심도 생겨서 일을 잘 하지로소도 사람처럼 잘 먹어야 근육도 붙고 힘도 생겨서 일을 잘 하지요
8소도 사람 맨치로 잘 먹어야 근육도 붇고 힘도 생겨서 일을 잘 하지요소도 사람 처럼 잘 먹어야 근육도 붇고 힘도 생겨서 일을 잘 하지요
9옷가심을 짜를 때는 미리 선을 끟어 놓아야 쪽바리 잘 자를 수 있어예옷감을 자를 때는 미리 선을 그어 놓아야 똑바로 잘 자를 수 있어요
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
# %% Validation data: drop pairs whose dialect and standard sentences are the same string
filtered_data_VL = {"src": [], "tar": []}

for dialect, standard in zip(dialect_sentences_VL, standard_sentences_VL):
    if standard == dialect:
        continue  # identical on both sides: not kept
    filtered_data_VL["src"].append(dialect)
    filtered_data_VL["tar"].append(standard)

filtered_df_VL = pd.DataFrame(filtered_data_VL)

filtered_df_VL[:10]

# %%
import matplotlib
import matplotlib.pyplot as plt


def sentenceLengths(sentences):
    """Return, for every sentence, the number of pieces ``split(' ')`` produces.

    Args:
        sentences: iterable of strings.

    Returns:
        list[int] in the same order as ``sentences``.
    """
    lengths = []
    for text in sentences:
        lengths.append(len(text.split(' ')))
    return lengths

# %% Word-count distribution of the dialect (source) training sentences
plt.hist(sentenceLengths(filtered_data_TR['src']), bins=10)
plt.xlabel('length of dialect')
plt.ylabel('number of dialect')
plt.show()

# %% Word-count distribution of the standard (target) training sentences
plt.hist(sentenceLengths(filtered_data_TR['tar']), bins=10)
plt.xlabel('length of standard')
plt.ylabel('number of standard')
plt.show()

# %%
def threshold_len_max(max_len, data):
    """Percentage of items in ``data`` whose ``len`` is at most ``max_len``.

    ``data`` may be a generator; it is materialised first because it is both
    iterated and measured. Raises ZeroDivisionError when ``data`` is empty.
    """
    items = list(data)
    within = sum(1 for item in items if len(item) <= max_len)
    return within / len(items) * 100


def threshold_len_min(min_len, data):
    """Percentage of items in ``data`` whose ``len`` is at least ``min_len``.

    ``data`` may be a generator; it is materialised first because it is both
    iterated and measured. Raises ZeroDivisionError when ``data`` is empty.
    """
    items = list(data)
    within = sum(1 for item in items if len(item) >= min_len)
    return within / len(items) * 100

# %% Number of training pairs left after removing identical pairs
len(filtered_data_TR['src'])

# %% Share of training sentences with at most max_len space-separated words
max_len = 22
dialect_max = threshold_len_max(
    max_len, (text.split(' ') for text in filtered_data_TR['src']))
standard_max = threshold_len_max(
    max_len, (text.split(' ') for text in filtered_data_TR['tar']))

print(f"dialect 중 {max_len} 이하인 비율은 {dialect_max}")
print(f"standard 중 {max_len} 이하인 비율은 {standard_max}")
# %%
## Many sentences are long, so only the roughly 80% of pairs within max_len words are kept.

# Positions whose dialect / standard sentence has at most max_len words.
d_filter_indices = [
    i for i, text in enumerate(filtered_data_TR['src'])
    if len(text.split(' ')) <= max_len
]
s_filter_indices = [
    i for i, text in enumerate(filtered_data_TR['tar'])
    if len(text.split(' ')) <= max_len
]

# %% Positions where BOTH sides satisfy the length cap
indices = list(set(d_filter_indices) & set(s_filter_indices))

# %%
len(indices)

# %% Copy the surviving pairs, preserving their original order
import tqdm

filtered_dialect = []
filtered_standard = []

# Membership is tested once per training pair; testing against the list
# `indices` rescans it on every iteration, so look positions up in a set.
kept_positions = set(indices)

for i in tqdm.tqdm(range(len(filtered_data_TR['src']))):
    if i in kept_positions:
        filtered_dialect.append(filtered_data_TR['src'][i])
        filtered_standard.append(filtered_data_TR['tar'][i])

# %% Save the preprocessed sentences as pickle files
import pickle

# One definition per file so the save and load cells cannot drift apart.
FILTERED_DIALECT_PKL = '/content/drive/MyDrive/LSTM+attention/filtered_dialect.pkl'
FILTERED_STANDARD_PKL = '/content/drive/MyDrive/LSTM+attention/filtered_standard.pkl'

with open(FILTERED_DIALECT_PKL, 'wb') as f:
    pickle.dump(filtered_dialect, f)

with open(FILTERED_STANDARD_PKL, 'wb') as f:
    pickle.dump(filtered_standard, f)

# %% Load the data back from the pickle files written above
# (reads the same Drive paths the save cell wrote; a bare relative file name
# only resolves if the working directory happens to be that folder).
with open(FILTERED_DIALECT_PKL, 'rb') as f:
    loaded_filtered_dialect = pickle.load(f)

with open(FILTERED_STANDARD_PKL, 'rb') as f:
    loaded_filtered_standard = pickle.load(f)
# %% Both lists must have the same length after filtering
print(len(filtered_dialect))
print(len(filtered_standard))

# %% Word-count distribution of the length-filtered dialect sentences
plt.hist(sentenceLengths(filtered_dialect), bins=10)
plt.xlabel('length of dialect')
plt.ylabel('number of dialect')
plt.show()

# %% Word-count distribution of the length-filtered standard sentences
plt.hist(sentenceLengths(filtered_standard), bins=10)
plt.xlabel('length of standard')
plt.ylabel('number of standard')
plt.show()

# %%
# Reserved vocabulary ids. They must be distinct and must match the entries
# Lang pre-registers in index2word; with both set to 0 the end-of-sentence id
# is indistinguishable from the start-of-sentence id.
SOS_token = 0
EOS_token = 1


class Lang:
    """Word-level vocabulary for one side of the translation pair.

    Attributes:
        name: label given at construction.
        word2index: word -> integer id (ids start at 2; 0 and 1 are reserved).
        word2count: word -> number of times the word was added.
        index2word: integer id -> word, pre-filled with the SOS and EOS entries.
        n_words: number of ids handed out so far, including SOS and EOS.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        self.n_words = 2  # SOS, EOS

    def addSentence(self, sentence):
        """Register every piece of ``sentence.split(" ")`` via addWord."""
        for word in sentence.split(" "):
            self.addWord(word)

    def addWord(self, word):
        """Give an unseen word the next free id, otherwise bump its count."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        self.word2index[word] = self.n_words
        self.word2count[word] = 1
        self.index2word[self.n_words] = word
        self.n_words += 1

# %% Create the Lang objects
dialect_lang = Lang("Dialect")
standard_lang = Lang("Standard")

# Add sentences. The validation sentences are added as well, so every
# validation word has an id when it is looked up later.
for sentence in filtered_dialect:
    dialect_lang.addSentence(sentence)
for sentence in filtered_standard:
    standard_lang.addSentence(sentence)
for sentence in filtered_df_VL['src']:
    dialect_lang.addSentence(sentence)
for sentence in filtered_df_VL['tar']:
    standard_lang.addSentence(sentence)

# Sentence pairs as (dialect, standard) tuples
pairs = list(zip(filtered_dialect, filtered_standard))
VL_pairs = list(zip(filtered_df_VL['src'], filtered_df_VL['tar']))


# Convert a sentence to vocabulary ids
def indexesFromSentence(lang, sentence):
    """Map each piece of ``sentence.split(' ')`` to its id in ``lang``.

    Raises:
        KeyError: if a word was never added to ``lang``.
    """
    return [lang.word2index[word] for word in sentence.split(' ')]
def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of exactly ``max_len`` ids.

    The word ids are followed by EOS, then padded with further EOS ids up to
    ``max_len``. Anything beyond ``max_len`` entries is cut off, which removes
    the trailing EOS when the sentence itself already has ``max_len`` words
    or more.

    Returns:
        torch.long tensor of shape (max_len, 1).
    """
    token_ids = indexesFromSentence(lang, sentence)
    token_ids.append(EOS_token)
    missing = max_len - len(token_ids)
    if missing > 0:
        token_ids.extend([EOS_token] * missing)  # padding
    return torch.tensor(token_ids[:max_len], dtype=torch.long).view(-1, 1)


def tensorsFromPair(pair):
    """Encode a (dialect, standard) pair as an (input, target) tensor tuple."""
    source_tensor = tensorFromSentence(dialect_lang, pair[0])
    reference_tensor = tensorFromSentence(standard_lang, pair[1])
    return (source_tensor, reference_tensor)

# %%
import torch
import torch.nn as nn
import torch.optim as optim

# Convert the validation pairs to id tensors
validation_input_tensors = [
    tensorFromSentence(dialect_lang, dialect) for dialect, _ in VL_pairs
]
validation_target_tensors = [
    tensorFromSentence(standard_lang, standard) for _, standard in VL_pairs
]


class EncoderRNN(nn.Module):
    """Embedding followed by a single LSTM, fed one token per call."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run one step; returns ``(output, hidden)`` exactly as the LSTM does."""
        step_input = self.embedding(input).view(1, 1, -1)
        return self.lstm(step_input, hidden)

    def initHidden(self):
        """Fresh all-zero (h, c) state, each of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        return (torch.zeros(state_shape), torch.zeros(state_shape))


class AttnDecoderRNN(nn.Module):
    """Single-step LSTM decoder with attention over ``max_length`` encoder slots.

    The attention layer emits ``max_length`` weights, so ``encoder_outputs``
    passed to ``forward`` must have ``max_length`` rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=max_len):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Layers are created in a fixed order so seeded initialisation is stable.
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one token.

        Args:
            input: id of the previous token.
            hidden: LSTM state tuple; its first element is used as the
                attention query.
            encoder_outputs: per-position encoder outputs.

        Returns:
            (log-probabilities over the output vocabulary, new hidden state,
            attention weights).
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Score every encoder slot from the current embedding and hidden state.
        query = torch.cat((embedded[0], hidden[0][0]), 1)
        attn_weights = nn.functional.softmax(self.attn(query), dim=1)
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        # Merge the embedding with the attended context before the LSTM step.
        merged = torch.cat((embedded[0], context[0]), 1)
        lstm_input = nn.functional.relu(self.attn_combine(merged).unsqueeze(0))
        output, hidden = self.lstm(lstm_input, hidden)

        log_probs = nn.functional.log_softmax(self.out(output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Fresh all-zero (h, c) state, each of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        return (torch.zeros(state_shape), torch.zeros(state_shape))
import random
import time
import math


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    m = math.floor(s / 60)
    s -= m * 60
    return f'{m}m {s:.2f}s'


def timeSince(since, percent):
    """Return elapsed time and a linear estimate of the time remaining.

    Args:
        since: ``time.time()`` value taken when the work started.
        percent: fraction of the work completed so far (must be non-zero).
    """
    now = time.time()
    s = now - since
    es = s / (percent)  # projected total duration
    rs = es - s         # projected remaining duration
    return f'{asMinutes(s)} (- {asMinutes(rs)})'


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=max_len):
    """Run one optimisation step on a single (input, target) pair.

    The decoder is always fed its own previous prediction (no teacher
    forcing), and decoding stops early once it predicts EOS.

    Args:
        input_tensor: source ids, one per row; at most ``max_length`` rows.
        target_tensor: target ids, one per row.
        encoder, decoder: the models being trained.
        encoder_optimizer, decoder_optimizer: their optimisers.
        criterion: loss applied to each decoder step.
        max_length: number of encoder output slots to allocate.

    Returns:
        The summed step loss divided by the full target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]])

    # The decoder starts from the encoder's final state.
    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # used as the next input

        loss += criterion(decoder_output, target_tensor[di])
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01, plot_every=100):
    """Train on ``n_iters`` randomly drawn pairs and plot the loss curve.

    Args:
        encoder, decoder: models to train (updated in place).
        n_iters: number of single-pair training steps.
        print_every: print the running average loss every this many steps.
        learning_rate: SGD learning rate for both optimisers.
        plot_every: record one averaged loss point every this many steps.
            Added last so existing positional calls keep their meaning.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Pairs are sampled with replacement and converted to tensors up front.
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[step - 1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f'{timeSince(start, step / n_iters)} ({step} {step / n_iters * 100:.2f}%) {print_loss_avg:.4f}')

        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
def showPlot(points):
    """Plot the recorded training-loss points as a line chart."""
    plt.figure()
    plt.plot(points)
    plt.title('Training Loss')
    plt.xlabel('Iterations')
    plt.ylabel('Loss')
    plt.show()

# %% Initialise and train the model
hidden_size = 256
encoder = EncoderRNN(dialect_lang.n_words, hidden_size)
decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)

trainIters(encoder, decoder, 75000, print_every=5000, plot_every=500)

# %%
def saveModel(encoder, decoder, encoder_path='encoder.pth', decoder_path='decoder.pth'):
    """Save both models' state dicts to the given paths."""
    torch.save(encoder.state_dict(), encoder_path)
    torch.save(decoder.state_dict(), decoder_path)


def loadModel(encoder_path='encoder.pth', decoder_path='decoder.pth'):
    """Rebuild the encoder/decoder and load their saved state dicts.

    The models are constructed from the current ``dialect_lang``,
    ``standard_lang`` and ``hidden_size`` globals, so those must match the
    values used when the weights were saved.

    Returns:
        (encoder, decoder)
    """
    encoder = EncoderRNN(dialect_lang.n_words, hidden_size)
    decoder = AttnDecoderRNN(hidden_size, standard_lang.n_words, dropout_p=0.1)
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))
    return encoder, decoder

# %% Test functions

def evaluate(encoder, decoder, sentence, max_length=max_len):
    """Greedily translate one dialect sentence.

    Both models are switched to eval mode for the duration of the call so the
    decoder's dropout does not randomise predictions, and their previous
    train/eval mode is restored afterwards.

    Args:
        encoder, decoder: trained models.
        sentence: dialect sentence; every word must exist in ``dialect_lang``.
        max_length: maximum number of decoding steps / encoder slots.

    Returns:
        list[str] of predicted words; an empty string is appended when EOS is
        predicted.
    """
    previous_modes = (encoder.training, decoder.training)
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(dialect_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
                encoder_outputs[ei] = encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]])  # SOS token
            decoder_hidden = encoder_hidden

            decoded_words = []

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == EOS_token:
                    # NOTE(review): an empty string marks the end of the
                    # sentence here - confirm whether a visible marker was intended.
                    decoded_words.append('')
                    break
                else:
                    decoded_words.append(standard_lang.index2word[topi.item()])

                decoder_input = topi.squeeze().detach()

            return decoded_words
    finally:
        encoder.train(previous_modes[0])
        decoder.train(previous_modes[1])


def evaluateRandomly(encoder, decoder, n=10):
    """Translate ``n`` pairs drawn at random (with replacement) from ``test_pairs`` and print them."""
    for i in range(n):
        pair = random.choice(test_pairs)
        print('Dialect:', pair[0])
        print('Expected:', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('Predicted:', output_sentence)
        print('')

# %%
## Test data still needs to be prepared
test_dialect_sentences = []
test_standard_sentences = []

test_pairs = list(zip(test_dialect_sentences, test_standard_sentences))

# %%
## Run the test function
evaluateRandomly(encoder, decoder, n=len(test_pairs))