diff --git a/README.md b/README.md index 9ef2265..ca4fb24 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -whisper_normalizer -================ +# whisper_normalizer @@ -37,6 +36,12 @@ as normalization step when evaluating competitive models like [AssemblyAI Conformer-1 model](https://www.assemblyai.com/blog/conformer-1/). +## Models evaluated using Whisper normalization + +- Massively Multilingual Speech (MMS) models by Meta +- Conformer 1 by AssemblyAI +- Conformer 2 by AssemblyAI + ## How to use OpenAI open source approach of text normalization/standardization is diff --git a/nbs/00_basic.ipynb b/nbs/00_basic.ipynb index 054c338..6c0ca70 100644 --- a/nbs/00_basic.ipynb +++ b/nbs/00_basic.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -198,7 +198,7 @@ "Note: It's not recommended to use this function for non-english languages because it removes vowels in languages like Malayalam which was identified by [kavya in this tweet](https://twitter.com/kavya_manohar/status/1752574864618365059)." 
] }, - "execution_count": 5, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -217,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -226,7 +226,7 @@ "'എന റ കമ പ യ ട ടറ ന എന റ ഭ ഷ'" ] }, - "execution_count": 6, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -238,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -247,7 +247,7 @@ "'language is like a map that we use to navigate the world but it s also like a prison that keeps us from seeing what s beyond the walls but what if there was a way to break out of this prison to expand our map to explore new worlds with new words this is the possibility and the challenge offered by instruction tuned language models like gpt 4 a cutting edge technology that uses artificial neural networks to generate natural language texts based on user inputs gpt 4 can write anything from essays to novels to poems to tweets to code to recipes to jokes to lyrics to whatever you want it can even write things that don t exist yet things that no human has ever thought of or said before as wittgenstein s quote suggests language is a source of limitation and liberation gpt 4 pushes this idea to the extreme by giving us access to unlimited language this could be the most significant new technology in modern history because it has the potential to change many domains and industries from education to entertainment from journalism to justice from science to art these models could enable new forms of learning storytelling reporting reasoning discovery and creation they could also create new ethical social and cultural challenges that require careful reflection and regulation how we use this technology will depend on how we recognize its implications for ourselves and others this technology is a form of artificial intelligence the word 
intelligence derives from inter and legere to be intelligent then is to be able to choose between things to pick out what matters to read what is written intelligence is not just a quantity or a quality it is an activity a process a practice it is something that we do with our minds and our words but when we let gpt 4 do this for us are we not abdicating our intelligence are we not letting go of our ability to choose to pick out to read are we not becoming passive consumers of language instead of active producers '" ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -274,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -283,7 +283,7 @@ "'language is like a map that we use to navigate the world but it s also like a prison that keeps us from seeing what s beyond the walls but what if there was a way to break out of this prison to expand our map to explore new worlds with new words this is the possibility and the challenge offered by instruction tuned language models like gpt 4 a cutting edge technology that uses artificial neural networks to generate natural language texts based on user inputs gpt 4 can write anything from essays to novels to poems to tweets to code to recipes to jokes to lyrics to whatever you want it can even write things that don t exist yet things that no human has ever thought of or said before as wittgenstein s quote suggests language is a source of limitation and liberation gpt 4 pushes this idea to the extreme by giving us access to unlimited language this could be the most significant new technology in modern history because it has the potential to change many domains and industries from education to entertainment from journalism to justice from science to art these models could enable new forms of learning storytelling reporting reasoning discovery and creation they could also create new ethical social and 
cultural challenges that require careful reflection and regulation how we use this technology will depend on how we recognize its implications for ourselves and others this technology is a form of artificial intelligence the word intelligence derives from inter and legere to be intelligent then is to be able to choose between things to pick out what matters to read what is written intelligence is not just a quantity or a quality it is an activity a process a practice it is something that we do with our minds and our words but when we let gpt 4 do this for us are we not abdicating our intelligence are we not letting go of our ability to choose to pick out to read are we not becoming passive consumers of language instead of active producers '" ] }, - "execution_count": 8, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -330,21 +330,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" } }, "nbformat": 4, diff --git a/nbs/01_english.ipynb b/nbs/01_english.ipynb index c09ba29..aa093c6 100644 --- a/nbs/01_english.ipynb +++ b/nbs/01_english.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ 
-515,7 +515,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -538,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -547,7 +547,7 @@ "'accessorize'" ] }, - "execution_count": 5, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -559,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -671,7 +671,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -729,7 +729,7 @@ "12. Replace any successive whitespace characters with a space." ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -748,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -757,7 +757,7 @@ "'i am a little teapot short and stout tip me over and pour me out'" ] }, - "execution_count": 8, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -769,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -778,7 +778,7 @@ "'language is like a map that we use to navigate the world but it s also like a prison that keeps us from seeing what s beyond the walls but what if there was a way to break out of this prison to expand our map to explore new worlds with new words this is the possibility and the challenge offered by instruction tuned language models like gpt 4 a cutting edge technology that uses artificial neural networks to generate natural language texts based on user inputs gpt 4 can write anything from essays to novels to poems to tweets to code to recipes to jokes to lyrics to whatever you want it can even write things that don t exist yet things that no human has 
ever thought of or said before as wittgenstein s quote suggests language is a source of limitation and liberation gpt 4 pushes this idea to the extreme by giving us access to unlimited language this could be the most significant new technology in modern history because it has the potential to change many domains and industries from education to entertainment from journalism to justice from science to art these models could enable new forms of learning storytelling reporting reasoning discovery and creation they could also create new ethical social and cultural challenges that require careful reflection and regulation how we use this technology will depend on how we recognize its implications for ourselves and others this technology is a form of artificial intelligence the word intelligence derives from inter and legere to be intelligent then is to be able to choose between things to pick out what matters to read what is written intelligence is not just a quantity or a quality it is an activity a process a practice it is something that we do with our minds and our words but when we let gpt 4 do this for us are we not abdicating our intelligence are we not letting go of our ability to choose to pick out to read are we not becoming passive consumers of language instead of active producers'" ] }, - "execution_count": 9, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -805,7 +805,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -823,21 +823,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" } }, "nbformat": 4, 
diff --git a/nbs/03_malayalam.ipynb b/nbs/03_malayalam.ipynb index 2963530..8399886 100644 --- a/nbs/03_malayalam.ipynb +++ b/nbs/03_malayalam.ipynb @@ -5,12 +5,18 @@ "id": "730b39f7-94eb-4496-999e-1a0c5a1ad9ea", "metadata": {}, "source": [ - "## Malayalam Text normalization" + "## Malayalam Text normalization\n", + "\n", + "An attempt to do normalization in Malayalam.\n", + "\n", + "TO READ:\n", + "\n", + "https://unicode.org/reports/tr15/" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "ccb040df-f503-4e5a-837d-525ea2fdafff", "metadata": {}, "outputs": [], @@ -20,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "9d57a998-688f-45db-aa8f-c4cfd73529ee", "metadata": {}, "outputs": [], @@ -31,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "7c6ba902-9364-4cba-aa0f-13606145ef5d", "metadata": {}, "outputs": [], @@ -45,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "f1a829d6-1f9d-4fdc-98f3-da5e7aa127b6", "metadata": {}, "outputs": [], @@ -84,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "51f96ad9-bcd4-4de7-a738-81208a9fe4a0", "metadata": {}, "outputs": [], @@ -94,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "7c57debb-c4a8-48eb-94b6-356a320877ae", "metadata": {}, "outputs": [ @@ -104,7 +110,7 @@ "'എന്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ'" ] }, - "execution_count": 6, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -115,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "fc3203a4-e02f-4019-8f8d-f43d06348914", "metadata": {}, "outputs": [ @@ -125,7 +131,7 @@ "'കൊച്ചി കെഎസ്ഇബി മുൻ ചെയർമാനും സംസ്ഥാന വനംവകുപ്പ് മേധാവിയുമായിരുന്ന ടിഎം മനോഹരൻ അന്തരിച്ചു ശനിയാഴ്ച വെെകീട്ട് ന് കളമശ്ശേരിയിലെ വീട്ടിൽ'" ] }, - "execution_count": 7, + 
"execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -136,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "7dc23a6e-6e53-4dbf-a12c-5d241ec6ed4a", "metadata": {}, "outputs": [], @@ -156,21 +162,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" } }, "nbformat": 4, diff --git a/nbs/index.ipynb b/nbs/index.ipynb index 343f8be..d3e7041 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -49,7 +49,13 @@ "## Why use this python package?\n", "\n", "This package is a python implementation of the text standardisation/normalization approach which is being used in OpenAI whisper text normalizer. If you want to use just text normalization alone, it's better to use this\n", - "instead reimplementing the same thing. OpenAI approach of text normalization is very helpful and is being used as normalization step when evaluating competitive models like [AssemblyAI Conformer-1 model](https://www.assemblyai.com/blog/conformer-1/)." + "instead reimplementing the same thing. 
OpenAI approach of text normalization is very helpful and is being used as normalization step when evaluating competitive models like [AssemblyAI Conformer-1 model](https://www.assemblyai.com/blog/conformer-1/).\n", + "\n", + "## Models evaluated using Whisper normalization\n", + "\n", + "- Massively Multilingual Speech (MMS) models by Meta\n", + "- Conformer 1 by AssemblyAI\n", + "- Conformer 2 by AssemblyAI" ] }, { diff --git a/settings.ini b/settings.ini index 3fe9245..a7e5cf2 100644 --- a/settings.ini +++ b/settings.ini @@ -1,7 +1,7 @@ [DEFAULT] repo = whisper_normalizer lib_name = whisper_normalizer -version = 0.0.3 +version = 0.0.4 min_python = 3.7 license = apache2 black_formatting = True diff --git a/whisper_normalizer/__init__.py b/whisper_normalizer/__init__.py index 27fdca4..81f0fde 100644 --- a/whisper_normalizer/__init__.py +++ b/whisper_normalizer/__init__.py @@ -1 +1 @@ -__version__ = "0.0.3" +__version__ = "0.0.4" diff --git a/whisper_normalizer/_modidx.py b/whisper_normalizer/_modidx.py index 8e8fe66..cca1620 100644 --- a/whisper_normalizer/_modidx.py +++ b/whisper_normalizer/_modidx.py @@ -40,4 +40,7 @@ 'whisper_normalizer/english.py'), 'whisper_normalizer.english.EnglishTextNormalizer.__init__': ( 'english.html#englishtextnormalizer.__init__', 'whisper_normalizer/english.py')}, - 'whisper_normalizer.malayalam': {}}} + 'whisper_normalizer.malayalam': { 'whisper_normalizer.malayalam.MalayalamTextNormalizer': ( 'malayalam.html#malayalamtextnormalizer', + 'whisper_normalizer/malayalam.py'), + 'whisper_normalizer.malayalam.MalayalamTextNormalizer.__call__': ( 'malayalam.html#malayalamtextnormalizer.__call__', + 'whisper_normalizer/malayalam.py')}}} diff --git a/whisper_normalizer/malayalam.py b/whisper_normalizer/malayalam.py index 2f5edb1..9739a2e 100644 --- a/whisper_normalizer/malayalam.py +++ b/whisper_normalizer/malayalam.py @@ -1,10 +1,40 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/03_malayalam.ipynb. 
class MalayalamTextNormalizer:
    """Text normalizer for Malayalam transcripts.

    BasicTextNormalizer removes vowels in languages like Malayalam, which was
    identified by [kavya in this tweet](https://twitter.com/kavya_manohar/status/1752574864618365059).
    This class is an attempt to build a text normalizer for Malayalam: apart
    from deleting bracketed/parenthesised spans, it only removes ASCII
    letters, ASCII digits and the punctuation characters ``-``, ``,``, ``.``
    and ``:``, and tidies whitespace.
    """

    # Compiled once when the class is created so that repeated calls do not
    # have to look the patterns up again on every substitution.
    _BRACKETED = re.compile(r"[<\[][^>\]]*[>\]]")
    _PARENTHESIZED = re.compile(r"\(([^)]+?)\)")
    _SPACE_BEFORE_APOSTROPHE = re.compile(r"\s+'")
    _ASCII_ALNUM = re.compile(r"[a-zA-Z0-9]")
    _PUNCTUATION = re.compile(r"[-,.:]")
    _WHITESPACE_RUN = re.compile(r"\s+")

    def __call__(self, s: str) -> str:
        """Return the normalized form of *s*.

        The steps are applied in this order:

        1. Lower-case the text.
        2. Remove spans opened by ``<`` or ``[`` and closed by ``>`` or ``]``.
        3. Remove non-empty spans between ``(`` and ``)``.
        4. Remove whitespace that directly precedes an apostrophe.
        5. Remove ASCII letters and ASCII digits.
        6. Remove the punctuation characters ``-``, ``,``, ``.`` and ``:``.
        7. Replace every run of whitespace with a single space.

        Leading and trailing whitespace is not stripped, so the result may
        start or end with a single space. Any other punctuation and
        non-ASCII digits are kept as they are.

        Args:
            s: The text to normalize.

        Returns:
            The normalized text.
        """
        s = s.lower()
        s = self._BRACKETED.sub("", s)
        s = self._PARENTHESIZED.sub("", s)
        s = self._SPACE_BEFORE_APOSTROPHE.sub("'", s)
        s = self._ASCII_ALNUM.sub("", s)
        s = self._PUNCTUATION.sub("", s)
        # Collapsing comes last because the removals above can leave several
        # adjacent spaces behind.
        return self._WHITESPACE_RUN.sub(" ", s)