# CRF_disease.py
from nltk.corpus import PlaintextCorpusReader
import nltk
import textblob
import re
import pandas as pd
# Root directory of the local corpus files (machine-specific Windows path).
cr = r"C:\Users\dell\PycharmProjects\LBD_test\corpus"
# Accumulators referenced by the commented-out loader code in this file;
# both start out empty.
pmids, text = [], []
# with open(cr+"\\NCBI_corpus_training.txt","r") as fp:
# strings=fp.readlines()
# for i in strings:
# string_split=i.split('\t')
# pmids.append(string_split[0])
# string_split.pop(0)
# text.append(' '.join(string_split))
#
# mydf=pd.DataFrame()
# mydf['pmid']=pmids
# mydf['text']=text
# sents_no=[]
# pmids=[]
# i=0
# for index,row in mydf.iterrows():
# stence=row['text']
# sents=stence.split('.')
# newsents=[]
# for sent in sents:
# newsents.append(sent+'.')
# i=i+1
# sents_no.append(i)
# pmids.append(row['pmid'])
#
# # result= re.findall(r"<category=\".`+?\">(.+?)</category>",stence, re.S)
# # print(result)
# print(sents_no)
# print(pmids)
# Scratch experiment: build one record per character of a sample sentence
# and print all records joined by single spaces.
# The names avoid rebinding the builtins `str` and `list`, which the
# original version shadowed at module level.
sentence = 'you are my shine.'
# Each record carries the whole sentence (not the individual character),
# exactly as the original loop did.
# NOTE(review): if a per-character record was intended, use the character
# as the 'str' value instead -- confirm with the author.
records = [{'str': sentence, "obj": "ss"} for _ in sentence]
# str.join() only accepts strings, so each dict is rendered with str()
# first; joining the dicts directly raised TypeError.
str2 = " ".join(str(record) for record in records)
print(str2)
# Open the corpus directory with a plaintext reader; the '.*' file-id
# pattern is a regular expression that matches every file under `cr`.
wodslist = PlaintextCorpusReader(cr, '.*')
# NOTE(review): indentation was lost in the original; the loop body is
# assumed to cover the tokenization, the print and the commented-out line.
for sent in wodslist.sents('NCBI_corpus_training.txt'):
    # Re-tokenize the sentence from its space-joined words. The result is
    # only consumed by the commented-out POS-tagging line below.
    text = nltk.word_tokenize(' '.join(sent))
    # A bare `nltk.wordpunct_tokenize` expression used to sit here; it only
    # referenced the function without calling it, so it was removed.
    print(sent)
    # print(nltk.pos_tag(text, tagset='universal'))