Skip to content

Commit

Permalink
fix: 토큰화할때 품사도 같이 저장
Browse files Browse the repository at this point in the history
  • Loading branch information
edcrfv458 committed May 12, 2024
1 parent 5627e42 commit fa559f0
Showing 1 changed file with 52 additions and 30 deletions.
82 changes: 52 additions & 30 deletions AI/사전 생성.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -63,14 +63,21 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 367955/367955 [19:44<00:00, 310.58it/s]\n"
" 0%| | 0/367955 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 367955/367955 [40:44<00:00, 150.55it/s] \n"
]
}
],
Expand All @@ -79,27 +86,40 @@
"standard_okt = []\n",
"dialect_okt = []\n",
"\n",
"stop_words = ['이', '에', '는', '가', '도', '을', '뭐', '은','하고','게','에는','그', '를', '것', '으로','로']\n",
"# stop_words = ['이', '에', '는', '가', '도', '을', '뭐', '은','하고','게','에는','그', '를', '것', '으로','로']\n",
"\n",
"for i in tqdm(range(0, len(all_sentences))):\n",
" standard_tokens = [token for token in okt.morphs(standard_sentences[i]) if token not in stop_words]\n",
" dialect_tokens = [token for token in okt.morphs(dialect_sentences[i]) if token not in stop_words]\n",
" # Okt.pos() already returns a list of (token, POS-tag) tuples, so the result is stored directly\n",
" # instead of being rebuilt element by element in a comprehension.\n",
" standard_tokens = okt.pos(standard_sentences[i])\n",
" dialect_tokens = okt.pos(dialect_sentences[i])\n",
" standard_okt.append(standard_tokens)\n",
" dialect_okt.append(dialect_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['내일', '잔치', '있어서', '날', '많이', '추우면', '안', '될텐데', '내일', '많이', '춥다', '하더냐']"
"[('내일', 'Noun'),\n",
" ('잔치', 'Noun'),\n",
" ('가', 'Josa'),\n",
" ('있어서', 'Adjective'),\n",
" ('날', 'Noun'),\n",
" ('이', 'Josa'),\n",
" ('많이', 'Adverb'),\n",
" ('추우면', 'Verb'),\n",
" ('안', 'Noun'),\n",
" ('될텐데', 'Verb'),\n",
" ('내일', 'Noun'),\n",
" ('많이', 'Adverb'),\n",
" ('춥다', 'Noun'),\n",
" ('하더냐', 'Verb')]"
]
},
"execution_count": 14,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -110,29 +130,31 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['내일',\n",
" '잔치',\n",
" '있어가',\n",
" '날',\n",
" '마이',\n",
" '추',\n",
" '우마',\n",
" '안',\n",
" '델낀디',\n",
" '내일',\n",
" '많이',\n",
" '춥다',\n",
" '카더',\n",
" '나']"
"[('내일', 'Noun'),\n",
" ('잔치', 'Noun'),\n",
" ('가', 'Josa'),\n",
" ('있어가', 'Adjective'),\n",
" ('날', 'Noun'),\n",
" ('이', 'Josa'),\n",
" ('마이', 'Noun'),\n",
" ('추', 'Noun'),\n",
" ('우마', 'Noun'),\n",
" ('안', 'Noun'),\n",
" ('델낀디', 'Noun'),\n",
" ('내일', 'Noun'),\n",
" ('많이', 'Adverb'),\n",
" ('춥다', 'Noun'),\n",
" ('카더', 'Noun'),\n",
" ('나', 'Josa')]"
]
},
"execution_count": 15,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -143,16 +165,16 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# 토큰화된 방언 문장과 표준어 문장을 각각 csv파일로 저장\n",
"with open('st_stopwords_okt_all.csv', 'w', encoding='utf-8', newline='') as f:\n",
"with open('standard_okt_data.csv', 'w', encoding='utf-8', newline='') as standard_file:\n",
"    # One row per sentence. Every field is a (token, POS) tuple, which csv writes as its str() form,\n",
"    # e.g. \"('내일', 'Noun')\" - NOTE(review): readers must parse that text back into a tuple.\n",
"    csv.writer(standard_file).writerows(standard_okt)\n",
"\n",
"with open('di_stopwords_okt_all.csv', 'w', encoding='utf-8', newline='') as f:\n",
"with open('dialect_okt_data.csv', 'w', encoding='utf-8', newline='') as dialect_file:\n",
"    # One row per dialect sentence. Every field is a (token, POS) tuple that csv writes via str(),\n",
"    # so each cell holds the tuple's text form - NOTE(review): confirm the loader expects this.\n",
"    csv.writer(dialect_file).writerows(dialect_okt)"
]
Expand Down

0 comments on commit fa559f0

Please sign in to comment.