Skip to content

Commit

Permalink
feat: add url pattern and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulJPhilp committed Mar 25, 2024
1 parent 86a1028 commit 0fdbf6c
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 98 deletions.
112 changes: 24 additions & 88 deletions src/patterns/URL.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
//

import { buildRegExp } from '../builders';
import { endOfString, startOfString, wordBoundary } from '../constructs/anchors';
import { endOfString, startOfString } from '../constructs/anchors';
import { anyOf, charClass, charRange, digit } from '../constructs/character-class';
import { choiceOf } from '../constructs/choice-of';
import { repeat } from '../constructs/repeat';
import { capture } from '../constructs/capture';
import { oneOrMore, optional } from '../constructs/quantifiers';
import type { RegexSequence } from '../types';
// import type { RegexElement, RegexSequence } from '../types';
// import { lookahead } from '../constructs/lookahead';

Expand All @@ -29,7 +30,7 @@ const alphabetical = charClass(lowercase, uppercase);
const specialChars = anyOf('._%+-');
const portSeperator = ':';
const schemeSeperator = ':';
//const doubleSlash = '//';
const doubleSlash = '//';

const pathSeparator = '/';
const querySeparator = '?';
Expand Down Expand Up @@ -71,104 +72,39 @@ export const UrlSchemeValidator = buildRegExp([startOfString, capture(urlScheme)
// 3. An optional port number, preceded by a colon (:)

This comment has been minimized.

Copy link
@mdjastrzebski

mdjastrzebski Mar 26, 2024

It would be handy to include links to the URL specs, so that it's easy to cross-reference the implementation.

This comment has been minimized.

Copy link
@PaulJPhilp

PaulJPhilp via email Mar 26, 2024

Author Owner
// Authority = [userinfo "@"] host [":" port]

/***
// Host: No Repeat, Eager Version
const hostnameEager = capture(repeat(hostnameChars, { min: 1, max: 255, greedy: false }));
const hostEagerNoRepeat = capture([hostnameEager, period, hostnameEager]);
export const hostEagerNoRepeatFinder = buildRegExp(hostEagerNoRepeat, {
ignoreCase: true,
global: true,
});
export const hostEagerNoRepeatValidator = buildRegExp(
[startOfString, hostEagerNoRepeat, endOfString],
{ ignoreCase: true },
);
// Host: No Repeat, Greedy Version
const hostnameGreedy = capture(repeat(hostnameChars, { min: 1, max: 255, greedy: true }));
const hostGreedyNoRepeat = capture([hostnameGreedy, period, hostnameGreedy]);
export const hostGreedyNoRepeatFinder = buildRegExp(hostEagerNoRepeat, {
ignoreCase: true,
global: true,
});
export const hostGreedyNoRepeatValidator = buildRegExp(
[startOfString, hostGreedyNoRepeat, endOfString],
{ ignoreCase: true },
);
// Host: ZeroOrMore, Eager Version
const hostEagerZeroOrMore = capture([hostnameEager, zeroOrMore([period, hostnameEager])]);
export const hostEagerZeroOrMoreFinder = buildRegExp(hostEagerZeroOrMore, {
ignoreCase: true,
global: true,
});
export const hostEagerZeroOrMoreValidator = buildRegExp(
[startOfString, hostEagerZeroOrMore, endOfString],
{ ignoreCase: true },
);
// Host: with Repeat, Greedy Version
const hostGreedyWithRepeat = capture([hostnameGreedy, repeat([period, hostnameGreedy], { min: 1, max: 255 })]);
export const hostGreedyWithRepeatFinder = buildRegExp(hostGreedyWithRepeat, {
ignoreCase: true,
global: true,
});
export const hostGreedyWithRepeatValidator = buildRegExp(
[startOfString, hostGreedyWithRepeat, endOfString],
{ ignoreCase: true },
);
// Host: ZeroOrMore, Greedy Version
const hostGreedyZeroOrMore = capture([hostnameGreedy, zeroOrMore([period, hostnameGreedy])]);
export const hostGreedyZeroOrMoreFinder = buildRegExp(hostGreedyZeroOrMore, {
ignoreCase: true,
global: true,
});
export const hostGreedyZeroOrMoreValidator = buildRegExp(
[startOfString, hostGreedyZeroOrMore, endOfString],
{ ignoreCase: true },
);
***/

const userInfo = oneOrMore(usernameChars);
const portNumber = capture(repeat(digit, { min: 1, max: 5, greedy: false }));
const port = capture([portSeperator, portNumber]);
const host = capture(repeat(hostnameChars, { min: 1, max: 255, greedy: false }));
const hostname = capture([host, optional(repeat([period, host], { min: 1, max: 255 }))]);
const urlAuthority = capture([optional([userInfo, at]), hostname, optional(port)]);
// Port number: between 1 and 5 digits (repeat is configured with greedy: false).
const portNumber = repeat(digit, { min: 1, max: 5, greedy: false });
// Port: the port separator (':') followed by the port number.
const port = [portSeperator, portNumber];
// Host: a run of 1 to 255 hostnameChars (repeat is configured with greedy: false).
const host = repeat(hostnameChars, { min: 1, max: 255, greedy: false });
// Hostname: one host run, optionally followed by 1 to 255 further `period` + host groups.
const hostname = [host, optional(repeat([period, host], { min: 1, max: 255 }))];

// Authority = [userinfo "@"] host [":" port]
// NOTE(review): `hostname` is an array, and choiceOf(...) is presumably used here
// only to embed that sequence as a single element — confirm against choiceOf's contract.
const urlAuthority: RegexSequence = [optional([userInfo, at]), choiceOf(hostname), optional(port)];

This comment has been minimized.

Copy link
@mdjastrzebski

mdjastrzebski Mar 26, 2024

Would you benefit either from:
a) a new helper, e.g. const urlAuthority: RegexSequence = regex([optional([userInfo, at]), choiceOf(hostname), optional(port)]), that would accept a plain regex sequence but wrap it as a single RegexElement?
b) alternatively, the ability to embed a RegexSequence as an element in another RegexSequence, where the encoding process would just flatten all these nested arrays? buildRegExp([startOfString, [optional([userInfo, at]), choiceOf(hostname), optional(port)], endOfString]);

This comment has been minimized.

Copy link
@PaulJPhilp

PaulJPhilp via email Mar 26, 2024

Author Owner

export const UrlAuthorityFinder = buildRegExp(urlAuthority, {
ignoreCase: true,
global: true,
});

export const UrlAuthorityValidator = buildRegExp([startOfString, urlAuthority, endOfString], {
ignoreCase: true,
});
// Validator for a URL authority: the authority pattern is placed between
// startOfString and endOfString, and the regex is built with ignoreCase enabled.
// NOTE(review): choiceOf(urlAuthority) presumably serves to embed the
// urlAuthority sequence as a single element of the outer sequence — confirm.
export const UrlAuthorityValidator = buildRegExp(
[startOfString, choiceOf(urlAuthority), endOfString],
{
ignoreCase: true,
},
);

//
// Convenience Pattern - Host:
// A hostname (e.g. www.google.com)
//

const urlHost = [host, choiceOf([pathSeparator, wordBoundary, endOfString])];
const urlHost = choiceOf(hostname);

export const UrlHostFinder = buildRegExp(capture(urlHost), {
export const UrlHostFinder = buildRegExp(urlHost, {
ignoreCase: true,
global: true,
});

export const UrlHostValidator = buildRegExp(capture(urlHost), {
ignoreCase: true,
});
export const UrlHostValidator = buildRegExp(urlHost, { ignoreCase: true });

// Path:
// The path is the part of the URL that comes after the authority and before the query.
Expand Down Expand Up @@ -228,16 +164,14 @@ export const UrlFragmentValidator = buildRegExp(urlFragment, {
ignoreCase: true,
});

const url = capture([
startOfString,
const url = [

This comment has been minimized.

Copy link
@mdjastrzebski

mdjastrzebski Mar 26, 2024

I think that the various formal parts of the URL schema should be exportable inside the library (and maybe even outside) as building blocks. The email validator will benefit from access to hostname, etc.

This comment has been minimized.

Copy link
@PaulJPhilp

PaulJPhilp via email Mar 26, 2024

Author Owner

This comment has been minimized.

Copy link
@PaulJPhilp

PaulJPhilp via email Mar 26, 2024

Author Owner
optional(urlScheme),
schemeSeperator,
optional(urlAuthority),
optional([doubleSlash, choiceOf(urlAuthority)]),
urlPath,
optional(urlQuery),
optional(urlFragment),
endOfString,
]);
];

/***
*** Find URL strings in a text.
Expand All @@ -252,4 +186,6 @@ export const urlFinder = buildRegExp(url, {
*** Check that given text is a valid URL.
***/

export const urlValidator = buildRegExp([startOfString, url, endOfString], { ignoreCase: true });
// Whole-string URL validator: the `url` pattern is placed between startOfString
// and endOfString, and the regex is built with ignoreCase enabled.
// NOTE(review): `url` is an array, so choiceOf(url) presumably embeds it as a
// single element of the outer sequence — confirm against choiceOf's contract.
export const urlValidator = buildRegExp([startOfString, choiceOf(url), endOfString], {
ignoreCase: true,
});
21 changes: 11 additions & 10 deletions src/patterns/__tests__/Url.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
import {
UrlAuthorityFinder,
UrlAuthorityValidator,
UrlHostFinder,
UrlHostValidator,
UrlSchemeFinder,
UrlSchemeValidator,
} from '../URL';
//import { hostGreedyNoRepeatValidator, hostEagerNoRepeatValidator, hostEagerZeroOrMoreValidator, hostGreedyZeroOrMoreValidator} from '../URL';
//import { UrlHostFinder, UrlHostValidator } from '../URL';

test('urlSchemeValidator', () => {
expect(UrlSchemeValidator).toMatchString('ftp:');
Expand Down Expand Up @@ -42,13 +42,14 @@ test('UrlAuthorityValidator', () => {
expect(UrlAuthorityValidator).toMatchString('abba@aaaa.aaaaaaa');
});

test('UrlHostValidator', () => {
expect(UrlHostValidator).toMatchString('www.google.com');
});

test('UrlHostFinder', () => {
expect(UrlHostFinder).toMatchString('www.google.com');
});

test('UrlAuthorityFinder', () => {
expect(UrlAuthorityFinder).toMatchAllGroups(
'The best place to search is https://www.google.com',
[['www.google.com', 'www.google.com', 'www.google.com']],
);
//expect(UrlAuthorityFinder).toMatchAllGroups('The alternatives are www.bing.com, perplexity.ai and OpenAI', [
//['www.bing.com'],
//['perplexity.ai'],
//]);
expect(UrlAuthorityFinder).toMatchString('abba@a');
});

2 comments on commit 0fdbf6c

@mdjastrzebski
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@PaulJPhilp looks like this is going in the right direction. URL is a complex pattern which should be decomposed into smaller building blocks. These building blocks might also be useful to form other patterns, e.g. email would benefit from hostname, userInfo, etc.

When possible, let's stick with the official RFC naming for these parts.

@PaulJPhilp
Copy link
Owner Author

@PaulJPhilp PaulJPhilp commented on 0fdbf6c Mar 26, 2024 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.