Skip to content

Commit

Permalink
Merge pull request #318 from BrikerMan/develop
Browse files Browse the repository at this point in the history
Release v1.1.0
  • Loading branch information
BrikerMan authored Dec 27, 2019
2 parents 71646ae + 19ba32d commit 9d72b7b
Show file tree
Hide file tree
Showing 33 changed files with 925 additions and 192 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -110,4 +110,5 @@ venv.bak/
.vscode
venv-tf/*
.pytype/
mkdocs/site
mkdocs/site
node_modules
8 changes: 7 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ env:
global:
- COVERALLS_PARALLEL=true
matrix:
# Scoring
- TEST_FILE=tests/scoring
# Labeling
- TEST_FILE=tests/labeling/
# classification part 1
Expand All @@ -17,6 +19,8 @@ env:
- TEST_FILE=tests/test_custom_multi_output_classification.py
# Embedding
- TEST_FILE=tests/embedding/
# Tokenizer
- TEST_FILE=tests/test_tokenizer.py

python:
- "3.6"
Expand All @@ -43,6 +47,7 @@ install:
- pip install nose
- python -c "import kashgari;print(f'kashgari version {kashgari.__version__}')"
- git fetch --unshallow --quiet
- export PYTHONPATH=`pwd`

script: nosetests --with-coverage --cover-html --cover-html-dir=htmlcov
--cover-xml --cover-xml-file=coverage.xml --with-xunit
Expand All @@ -67,8 +72,9 @@ jobs:
- stage: Document
python: "3.6"
install:
- echo -e "machine github.com\n login ${GITHUB_TOKEN}" > ~/.netrc
- echo -e "machine github.com\n login ${GITHUB_TOKEN}" > ~/.netrc
- pip install mkdocs mkdocs-material pymdown-extensions
script:
- cp README.md mkdocs/docs/index.md
- cd mkdocs
- mkdocs gh-deploy --force --clean
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ Here is a set of quick tutorials to get you started with the library:

- [Tutorial 1: Text Classification](https://kashgari.bmio.net/tutorial/text-classification/)
- [Tutorial 2: Text Labeling](https://kashgari.bmio.net/tutorial/text-labeling/)
- [Tutorial 3: Language Embedding](https://kashgari.bmio.net/embeddings/)
- [Tutorial 3: Text Scoring](https://kashgari.bmio.net/tutorial/text-scoring/)
- [Tutorial 4: Language Embedding](https://kashgari.bmio.net/embeddings/)

There are also articles and posts that illustrate how to use Kashgari:

Expand Down
5 changes: 3 additions & 2 deletions kashgari/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
custom_objects = keras_bert.get_custom_objects()
CLASSIFICATION = TaskType.CLASSIFICATION
LABELING = TaskType.LABELING
SCORING = TaskType.SCORING

from kashgari.version import __version__

Expand All @@ -35,6 +36,6 @@
from kashgari import utils
from kashgari import callbacks

from kashgari import migeration
from kashgari import migration

migeration.show_migration_guide()
migration.show_migration_guide()
6 changes: 4 additions & 2 deletions kashgari/embeddings/base_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from tensorflow import keras

import kashgari
from kashgari.processors import ClassificationProcessor, LabelingProcessor
from kashgari.processors import ClassificationProcessor, LabelingProcessor, ScoringProcessor
from kashgari.processors.base_processor import BaseProcessor

L = keras.layers
Expand Down Expand Up @@ -74,8 +74,10 @@ def __init__(self,
self.processor = ClassificationProcessor()
elif task == kashgari.LABELING:
self.processor = LabelingProcessor()
elif task == kashgari.SCORING:
self.processor = ScoringProcessor()
else:
raise ValueError()
raise ValueError('Need to set the processor param, value: {labeling, classification, scoring}')
else:
self.processor = processor

Expand Down
2 changes: 0 additions & 2 deletions kashgari/embeddings/gpt_2_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@
import numpy as np
import kashgari
import pathlib
import tensorflow as tf
from tensorflow.python.keras.utils import get_file
from kashgari.layers import NonMaskingLayer, L
from kashgari.embeddings.base_embedding import Embedding
from kashgari.processors.base_processor import BaseProcessor
import keras_gpt_2 as gpt2
Expand Down
4 changes: 2 additions & 2 deletions kashgari/layers/att_wgt_avg_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
# file: attention_weighted_average.py
# time: 2019-06-24 19:35

import kashgari
import tensorflow as tf
from tensorflow.python import keras
from tensorflow.python.keras import backend as K

import kashgari

L = keras.layers
initializers = keras.initializers
InputSpec = L.InputSpec
Expand Down
1 change: 1 addition & 0 deletions kashgari/macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
class TaskType(object):
    """Plain-string identifiers for the task types.

    The values are ordinary ``str`` constants (not an Enum), so they can be
    compared directly against string literals such as ``'scoring'``.
    """
    CLASSIFICATION = 'classification'
    LABELING = 'labeling'
    SCORING = 'scoring'


class Config(object):
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions kashgari/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@

from kashgari.processors.classification_processor import ClassificationProcessor
from kashgari.processors.labeling_processor import LabelingProcessor
from kashgari.processors.scoring_processor import ScoringProcessor
8 changes: 3 additions & 5 deletions kashgari/processors/base_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,15 @@
# file: base_processor.py
# time: 2019-05-21 11:27

import os
import json
import collections
import logging
import pathlib
import operator
import collections
from typing import List, Optional, Union, Dict, Any

import numpy as np
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from kashgari import utils
import numpy as np


class BaseProcessor(object):
Expand Down
6 changes: 5 additions & 1 deletion kashgari/processors/classification_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ class ClassificationProcessor(BaseProcessor):
def __init__(self, multi_label=False, **kwargs):
super(ClassificationProcessor, self).__init__(**kwargs)
self.multi_label = multi_label
self.multi_label_binarizer: MultiLabelBinarizer = None
if self.label2idx:
self.multi_label_binarizer: MultiLabelBinarizer = MultiLabelBinarizer(classes=list(self.label2idx.keys()))
self.multi_label_binarizer.fit([])
else:
self.multi_label_binarizer: MultiLabelBinarizer = None

def info(self):
info = super(ClassificationProcessor, self).info()
Expand Down
100 changes: 100 additions & 0 deletions kashgari/processors/scoring_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: scoring_processor.py
# time: 11:10 AM

from typing import List, Optional

import numpy as np

import kashgari
from kashgari import utils
from kashgari.processors.base_processor import BaseProcessor


def is_numeric(obj):
    """Return True when *obj* supports the basic arithmetic operators.

    This is a duck-typing check: the object must expose ``__add__``,
    ``__sub__``, ``__mul__``, ``__truediv__`` and ``__pow__``. Built-in
    ``int``/``float`` values qualify; ``str`` and ``list`` do not because
    they lack ``__sub__``, ``__truediv__`` and ``__pow__``.

    Args:
        obj: any object.

    Returns:
        bool: whether every required operator method is present.
    """
    attrs = ['__add__', '__sub__', '__mul__', '__truediv__', '__pow__']
    return all(hasattr(obj, attr) for attr in attrs)


class ScoringProcessor(BaseProcessor):
    """
    Corpus pre-processor for the scoring task.

    Labels are numeric scores, so label sequences are passed through
    unchanged; the only label-related state is ``output_dim``, the number of
    score values per sample.
    """

    def __init__(self, output_dim=None, **kwargs):
        """
        Args:
            output_dim: number of score values per sample. When ``None`` it is
                inferred from the first label in ``_build_label_dict``.
            **kwargs: forwarded unchanged to ``BaseProcessor.__init__``.
        """
        super(ScoringProcessor, self).__init__(**kwargs)
        self.output_dim = output_dim

    def info(self):
        """Return the base processor info with ``task`` set to ``kashgari.SCORING``."""
        info = super(ScoringProcessor, self).info()
        info['task'] = kashgari.SCORING
        return info

    def _build_label_dict(self,
                          label_list: List[List[float]]):
        """
        Infer ``output_dim`` from the first label sample when it is not set.

        No label-to-index mapping is built; an explicitly provided
        ``output_dim`` is left untouched.

        Args:
            label_list: corpus label list

        Raises:
            ValueError: if the label list is empty, or the first sample is not
                a number, a list, or a 1D numpy array.
        """
        if self.output_dim is not None:
            return

        if len(label_list) == 0:
            raise ValueError('Scoring label list is empty, cannot infer output_dim')

        label_sample = label_list[0]
        if isinstance(label_sample, np.ndarray) and label_sample.ndim >= 1:
            # Arrays also expose the arithmetic operators, so they must be
            # handled before the `is_numeric` check; otherwise a 2D+ sample
            # would silently be treated as a single scalar score.
            if label_sample.ndim != 1:
                raise ValueError('Scoring Label Sample must be a float, float array or 1D numpy array')
            self.output_dim = label_sample.shape[0]
        elif is_numeric(label_sample):
            self.output_dim = 1
        elif isinstance(label_sample, list):
            self.output_dim = len(label_sample)
        else:
            raise ValueError('Scoring Label Sample must be a float, float array or 1D numpy array')
        # NOTE: a check that all labels lie in the range [0, 1] was drafted
        # here but is disabled.

    def process_y_dataset(self,
                          data: List[List[str]],
                          max_len: Optional[int] = None,
                          subset: Optional[List[int]] = None) -> np.ndarray:
        """
        Convert the labels (or a subset of them) to a numpy array.

        Args:
            data: label list.
            max_len: accepted for signature compatibility; not used here.
            subset: optional indices; when given, the labels are selected
                with ``utils.get_list_subset``.

        Returns:
            numpy array built from the selected labels.
        """
        if subset is not None:
            target = utils.get_list_subset(data, subset)
        else:
            target = data[:]
        return np.array(target)

    def numerize_token_sequences(self,
                                 sequences: List[List[str]]):
        """
        Map each token sequence to a list of vocabulary indices.

        BOS/EOS tokens are wrapped around every sequence when
        ``self.add_bos_eos`` is set; tokens missing from ``self.token2idx``
        are mapped to the index of ``self.token_unk``.
        """
        result = []
        for seq in sequences:
            if self.add_bos_eos:
                seq = [self.token_bos] + seq + [self.token_eos]
            unk_index = self.token2idx[self.token_unk]
            result.append([self.token2idx.get(token, unk_index) for token in seq])
        return result

    def numerize_label_sequences(self,
                                 sequences: List[List[str]]) -> List[List[int]]:
        """Return the labels unchanged; scores need no index encoding."""
        return sequences

    def reverse_numerize_label_sequences(self,
                                         sequences,
                                         lengths=None):
        """Return the predictions unchanged; ``lengths`` is ignored."""
        return sequences


if __name__ == "__main__":
    # Manual smoke check: analyze a tiny corpus with scalar scores and print
    # the processed label array.
    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()
    x = x[:3]
    # Replace the corpus labels with float scores, one per sample.
    y = [0.2, 0.3, 0.2]
    p = ScoringProcessor()
    p.analyze_corpus(x, y)
    print(p.process_y_dataset(y))
12 changes: 8 additions & 4 deletions kashgari/tasks/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,12 +414,16 @@ def predict(self,
lengths = [len(sen) for sen in x_data]
tensor = self.embedding.process_x_dataset(x_data)
pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
res = self.embedding.reverse_numerize_label_sequences(pred.argmax(-1),
if self.task == 'scoring':
t_pred = pred
else:
t_pred = pred.argmax(-1)
res = self.embedding.reverse_numerize_label_sequences(t_pred,
lengths)
if debug_info:
logging.info('input: {}'.format(tensor))
logging.info('output: {}'.format(pred))
logging.info('output argmax: {}'.format(pred.argmax(-1)))
print('input: {}'.format(tensor))
print('output: {}'.format(pred))
print('output argmax: {}'.format(t_pred))
return res

def evaluate(self,
Expand Down
5 changes: 3 additions & 2 deletions kashgari/tasks/classification/dpcnn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
# https://github.com/miracleyoo/DPCNN-TextCNN-Pytorch-Inception
# https://www.kaggle.com/michaelsnell/conv1d-dpcnn-in-keras

import logging
from math import log2, floor
import tensorflow as tf
from typing import Dict, Any

import tensorflow as tf

from kashgari.layers import L, KMaxPoolingLayer
from kashgari.tasks.classification.base_model import BaseClassificationModel

Expand Down
14 changes: 14 additions & 0 deletions kashgari/tasks/scoring/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# encoding: utf-8

# author: BrikerMan
# contact: eliyar917@gmail.com
# blog: https://eliyar.biz

# file: __init__.py
# time: 11:36 AM


from kashgari.tasks.scoring.models import BiLSTM_Model

if __name__ == "__main__":
    # Nothing to run: this package module only imports BiLSTM_Model.
    pass
Loading

0 comments on commit 9d72b7b

Please sign in to comment.