From 0fdbf6c2fe6f69ea70dfdbb26a9991d1ddf614d5 Mon Sep 17 00:00:00 2001 From: PaulJPhilp <138239846+PaulJPhilp@users.noreply.github.com> Date: Mon, 25 Mar 2024 15:44:43 -0400 Subject: [PATCH] feat: add url pattern and tests --- src/patterns/URL.ts | 112 ++++++++-------------------------- src/patterns/__tests__/Url.ts | 21 ++++--- 2 files changed, 35 insertions(+), 98 deletions(-) diff --git a/src/patterns/URL.ts b/src/patterns/URL.ts index fcea1f8..42478fa 100644 --- a/src/patterns/URL.ts +++ b/src/patterns/URL.ts @@ -7,12 +7,13 @@ // import { buildRegExp } from '../builders'; -import { endOfString, startOfString, wordBoundary } from '../constructs/anchors'; +import { endOfString, startOfString } from '../constructs/anchors'; import { anyOf, charClass, charRange, digit } from '../constructs/character-class'; import { choiceOf } from '../constructs/choice-of'; import { repeat } from '../constructs/repeat'; import { capture } from '../constructs/capture'; import { oneOrMore, optional } from '../constructs/quantifiers'; +import type { RegexSequence } from '../types'; // import type { RegexElement, RegexSequence } from '../types'; // import { lookahead } from '../constructs/lookahead'; @@ -29,7 +30,7 @@ const alphabetical = charClass(lowercase, uppercase); const specialChars = anyOf('._%+-'); const portSeperator = ':'; const schemeSeperator = ':'; -//const doubleSlash = '//'; +const doubleSlash = '//'; const pathSeparator = '/'; const querySeparator = '?'; @@ -71,104 +72,39 @@ export const UrlSchemeValidator = buildRegExp([startOfString, capture(urlScheme) // 3. An optional port number, preceded by a colon (:) // Authority = [userinfo "@"] host [":" port] -/*** -// Host: No Repeat, Eager Version -const hostnameEager = capture(repeat(hostnameChars, { min: 1, max: 255, greedy: false })); -const hostEagerNoRepeat = capture([hostnameEager, period, hostnameEager]); -export const hostEagerNoRepeatFinder = buildRegExp(hostEagerNoRepeat, { - ignoreCase: true, - global: true, -}); - -export const hostEagerNoRepeatValidator = buildRegExp( - [startOfString, hostEagerNoRepeat, endOfString], - { ignoreCase: true }, -); - -// Host: No Repeat, Greedy Version -const hostnameGreedy = capture(repeat(hostnameChars, { min: 1, max: 255, greedy: true })); -const hostGreedyNoRepeat = capture([hostnameGreedy, period, hostnameGreedy]); -export const hostGreedyNoRepeatFinder = buildRegExp(hostEagerNoRepeat, { - ignoreCase: true, - global: true, -}); - -export const hostGreedyNoRepeatValidator = buildRegExp( - [startOfString, hostGreedyNoRepeat, endOfString], - { ignoreCase: true }, -); - -// Host: ZeroOrMore, Eager Version - -const hostEagerZeroOrMore = capture([hostnameEager, zeroOrMore([period, hostnameEager])]); -export const hostEagerZeroOrMoreFinder = buildRegExp(hostEagerZeroOrMore, { - ignoreCase: true, - global: true, -}); - -export const hostEagerZeroOrMoreValidator = buildRegExp( - [startOfString, hostEagerZeroOrMore, endOfString], - { ignoreCase: true }, -); - -// Host: with Repeat, Greedy Version - -const hostGreedyWithRepeat = capture([hostnameGreedy, repeat([period, hostnameGreedy], { min: 1, max: 255 })]); -export const hostGreedyWithRepeatFinder = buildRegExp(hostGreedyWithRepeat, { - ignoreCase: true, - global: true, -}); - -export const hostGreedyWithRepeatValidator = buildRegExp( - [startOfString, hostGreedyWithRepeat, endOfString], - { ignoreCase: true }, -); - -// Host: ZeroOrMore, Greedy Version - -const hostGreedyZeroOrMore = capture([hostnameGreedy, zeroOrMore([period, hostnameGreedy])]); -export const hostGreedyZeroOrMoreFinder = buildRegExp(hostGreedyZeroOrMore, { - ignoreCase: true, - global: true, -}); - -export const hostGreedyZeroOrMoreValidator = buildRegExp( - [startOfString, hostGreedyZeroOrMore, endOfString], - { ignoreCase: true }, -); -***/ - const userInfo = oneOrMore(usernameChars); -const portNumber = capture(repeat(digit, { min: 1, max: 5, greedy: false })); -const port = capture([portSeperator, portNumber]); -const host = capture(repeat(hostnameChars, { min: 1, max: 255, greedy: false })); -const hostname = capture([host, optional(repeat([period, host], { min: 1, max: 255 }))]); -const urlAuthority = capture([optional([userInfo, at]), hostname, optional(port)]); +const portNumber = repeat(digit, { min: 1, max: 5, greedy: false }); +const port = [portSeperator, portNumber]; +const host = repeat(hostnameChars, { min: 1, max: 255, greedy: false }); +const hostname = [host, optional(repeat([period, host], { min: 1, max: 255 }))]; + +const urlAuthority: RegexSequence = [optional([userInfo, at]), choiceOf(hostname), optional(port)]; export const UrlAuthorityFinder = buildRegExp(urlAuthority, { ignoreCase: true, global: true, }); -export const UrlAuthorityValidator = buildRegExp([startOfString, urlAuthority, endOfString], { - ignoreCase: true, -}); +export const UrlAuthorityValidator = buildRegExp( + [startOfString, choiceOf(urlAuthority), endOfString], + { + ignoreCase: true, + }, +); // // Convenience Pattern - Host: // A hostname (e.g. www.google.com) // -const urlHost = [host, choiceOf([pathSeparator, wordBoundary, endOfString])]; +const urlHost = choiceOf(hostname); -export const UrlHostFinder = buildRegExp(capture(urlHost), { +export const UrlHostFinder = buildRegExp(urlHost, { ignoreCase: true, global: true, }); -export const UrlHostValidator = buildRegExp(capture(urlHost), { - ignoreCase: true, -}); +export const UrlHostValidator = buildRegExp(urlHost, { ignoreCase: true }); // Path: // The path is the part of the URL that comes after the authority and before the query. @@ -228,16 +164,14 @@ export const UrlFragmentValidator = buildRegExp(urlFragment, { ignoreCase: true, }); -const url = capture([ - startOfString, +const url = [ optional(urlScheme), schemeSeperator, - optional(urlAuthority), + optional([doubleSlash, choiceOf(urlAuthority)]), urlPath, optional(urlQuery), optional(urlFragment), - endOfString, -]); +]; /*** *** Find URL strings in a text. @@ -252,4 +186,6 @@ export const urlFinder = buildRegExp(url, { *** Check that given text is a valid URL. ***/ -export const urlValidator = buildRegExp([startOfString, url, endOfString], { ignoreCase: true }); +export const urlValidator = buildRegExp([startOfString, choiceOf(url), endOfString], { + ignoreCase: true, +}); diff --git a/src/patterns/__tests__/Url.ts b/src/patterns/__tests__/Url.ts index 4d8375f..104aa58 100644 --- a/src/patterns/__tests__/Url.ts +++ b/src/patterns/__tests__/Url.ts @@ -2,11 +2,11 @@ import { UrlAuthorityFinder, UrlAuthorityValidator, + UrlHostFinder, + UrlHostValidator, UrlSchemeFinder, UrlSchemeValidator, } from '../URL'; -//import { hostGreedyNoRepeatValidator, hostEagerNoRepeatValidator, hostEagerZeroOrMoreValidator, hostGreedyZeroOrMoreValidator} from '../URL'; -//import { UrlHostFinder, UrlHostValidator } from '../URL'; test('urlSchemeValidator', () => { expect(UrlSchemeValidator).toMatchString('ftp:'); @@ -42,13 +42,14 @@ test('UrlAuthorityValidator', () => { expect(UrlAuthorityValidator).toMatchString('abba@aaaa.aaaaaaa'); }); +test('UrlHostValidator', () => { + expect(UrlHostValidator).toMatchString('www.google.com'); +}); + +test('UrlHostFinder', () => { + expect(UrlHostFinder).toMatchString('www.google.com'); +}); + test('UrlAuthorityFinder', () => { - expect(UrlAuthorityFinder).toMatchAllGroups( - 'The best place to search is https://www.google.com', - [['www.google.com', 'www.google.com', 'www.google.com']], - ); - //expect(UrlAuthorityFinder).toMatchAllGroups('The alternatives are www.bing.com, perplexity.ai and OpenAI', [ - //['www.bing.com'], - //['perplexity.ai'], - //]); + expect(UrlAuthorityFinder).toMatchString('abba@a'); });