From 9fa45bc6430c53bc5ac97a25c618d45ebadfc20e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ale=C5=A1=20Roub=C3=AD=C4=8Dek?= <ales@roubicek.name>
Date: Mon, 30 Sep 2024 15:36:21 +0200
Subject: [PATCH 1/6] fix: Make srcset parsing HTML spec compliant (#16323)

---
 .../vite/src/node/__tests__/utils.spec.ts     | 10 +++
 packages/vite/src/node/utils.ts               | 69 ++++++++-----------
 2 files changed, 39 insertions(+), 40 deletions(-)
diff --git a/packages/vite/src/node/__tests__/utils.spec.ts b/packages/vite/src/node/__tests__/utils.spec.ts
index ac10229fc1b2e5..e022d04fcf328c 100644
--- a/packages/vite/src/node/__tests__/utils.spec.ts
+++ b/packages/vite/src/node/__tests__/utils.spec.ts
@@ -359,6 +359,16 @@ describe('processSrcSetSync', () => {
       processSrcSetSync('https://anydomain/image.jpg', ({ url }) => url),
     ).toBe(source)
   })
+
+  test('should not break URLs with commas in srcSet', async () => {
+    const source = `
+      \thttps://example.com/dpr_1,f_auto,fl_progressive,q_auto,w_100/v1/img   1x,
+      \thttps://example.com/dpr_2,f_auto,fl_progressive,q_auto,w_100/v1/img\t\t2x
+    `
+    const result =
+      'https://example.com/dpr_1,f_auto,fl_progressive,q_auto,w_100/v1/img 1x, https://example.com/dpr_2,f_auto,fl_progressive,q_auto,w_100/v1/img 2x'
+    expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
+  })
 })
 
 describe('flattenId', () => {
diff --git a/packages/vite/src/node/utils.ts b/packages/vite/src/node/utils.ts
index 254b52b468e4fa..d94b1e043d5b37 100644
--- a/packages/vite/src/node/utils.ts
+++ b/packages/vite/src/node/utils.ts
@@ -720,26 +720,40 @@ interface ImageCandidate {
   url: string
   descriptor: string
 }
-const escapedSpaceCharacters = /(?: |\\t|\\n|\\f|\\r)+/g
-const imageSetUrlRE = /^(?:[\w-]+\(.*?\)|'.*?'|".*?"|\S*)/
+
 function joinSrcset(ret: ImageCandidate[]) {
   return ret
     .map(({ url, descriptor }) => url + (descriptor ? ` ${descriptor}` : ''))
     .join(', ')
 }
 
-// NOTE: The returned `url` should perhaps be decoded so all handled URLs within Vite are consistently decoded.
-// However, this may also require a refactor for `cssReplacer` to accept decoded URLs instead.
-function splitSrcSetDescriptor(srcs: string): ImageCandidate[] {
-  return splitSrcSet(srcs)
-    .map((s) => {
-      const src = s.replace(escapedSpaceCharacters, ' ').trim()
-      const url = imageSetUrlRE.exec(src)?.[0] ?? ''
+/**
+ This regex represents a loose rule of an “image candidate string”.
+
+ @see https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
 
-      return {
-        url,
-        descriptor: src.slice(url.length).trim(),
-      }
+  An “image candidate string” roughly consists of the following:
+  1. Zero or more whitespace characters.
+  2. A non-empty URL that does not start or end with `,`.
+  3. Zero or more whitespace characters.
+  4. An optional “descriptor” that starts with a whitespace character.
+  5. Zero or more whitespace characters.
+  6. Each image candidate string is separated by a `,`.
+ */
+// eslint-disable-next-line regexp/no-super-linear-backtracking
+const imageCandidateRegex = /\s*([^,]\S*[^,](?:\s[^,]+)?)\s*(?:,|$)/
+const escapedSpaceCharacters = /(?: |\\t|\\n|\\f|\\r)+/g
+
+export function parseSrcset(string: string): ImageCandidate[] {
+  return string
+    .replace(escapedSpaceCharacters, ' ')
+    .replace(/\r?\n/, '')
+    .replace(/,\s+/, ', ')
+    .split(imageCandidateRegex)
+    .filter((_part, index) => index % 2 === 1)
+    .map((part) => {
+      const [url, descriptor] = part.trim().split(/\s+/)
+      return { url, descriptor }
     })
     .filter(({ url }) => !!url)
 }
@@ -749,7 +763,7 @@ export function processSrcSet(
   replacer: (arg: ImageCandidate) => Promise<string>,
 ): Promise<string> {
   return Promise.all(
-    splitSrcSetDescriptor(srcs).map(async ({ url, descriptor }) => ({
+    parseSrcset(srcs).map(async ({ url, descriptor }) => ({
       url: await replacer({ url, descriptor }),
       descriptor,
     })),
@@ -761,38 +775,13 @@ export function processSrcSetSync(
   replacer: (arg: ImageCandidate) => string,
 ): string {
   return joinSrcset(
-    splitSrcSetDescriptor(srcs).map(({ url, descriptor }) => ({
+    parseSrcset(srcs).map(({ url, descriptor }) => ({
       url: replacer({ url, descriptor }),
       descriptor,
     })),
   )
 }
 
-const cleanSrcSetRE =
-  /(?:url|image|gradient|cross-fade)\([^)]*\)|"([^"]|(?<=\\)")*"|'([^']|(?<=\\)')*'|data:\w+\/[\w.+-]+;base64,[\w+/=]+|\?\S+,/g
-function splitSrcSet(srcs: string) {
-  const parts: string[] = []
-  /**
-   * There could be a ',' inside of:
-   * - url(data:...)
-   * - linear-gradient(...)
-   * - "data:..."
-   * - data:...
-   * - query parameter ?...
-   */
-  const cleanedSrcs = srcs.replace(cleanSrcSetRE, blankReplacer)
-  let startIndex = 0
-  let splitIndex: number
-  do {
-    splitIndex = cleanedSrcs.indexOf(',', startIndex)
-    parts.push(
-      srcs.slice(startIndex, splitIndex !== -1 ? splitIndex : undefined),
-    )
-    startIndex = splitIndex + 1
-  } while (splitIndex !== -1)
-  return parts
-}
-
 const windowsDriveRE = /^[A-Z]:/
 const replaceWindowsDriveRE = /^([A-Z]):\//
 const linuxAbsolutePathRE = /^\/[^/]/

From 04505c703158a88ccca34b66e9cff0990ef44d47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ale=C5=A1=20Roub=C3=AD=C4=8Dek?= <ales@roubicek.name>
Date: Tue, 1 Oct 2024 06:16:02 +0200
Subject: [PATCH 2/6] fix: make srcset parsing also conformant with CSS
 image-set spec

---
 .../vite/src/node/__tests__/utils.spec.ts     | 38 +++++++++++++++++++
 packages/vite/src/node/utils.ts               | 13 +++++--
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/packages/vite/src/node/__tests__/utils.spec.ts b/packages/vite/src/node/__tests__/utils.spec.ts
index e022d04fcf328c..c61e1e6695d9ef 100644
--- a/packages/vite/src/node/__tests__/utils.spec.ts
+++ b/packages/vite/src/node/__tests__/utils.spec.ts
@@ -369,6 +369,44 @@ describe('processSrcSetSync', () => {
       'https://example.com/dpr_1,f_auto,fl_progressive,q_auto,w_100/v1/img 1x, https://example.com/dpr_2,f_auto,fl_progressive,q_auto,w_100/v1/img 2x'
     expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
   })
+
+  test('should not break URLs with commas in image-set-options', async () => {
+    const source = `url(https://example.com/dpr_1,f_auto,fl_progressive,q_auto,w_100/v1/img)   1x,
+      url("https://example.com/dpr_2,f_auto,fl_progressive,q_auto,w_100/v1/img")\t\t2x
+    `
+    const result =
+      'url(https://example.com/dpr_1,f_auto,fl_progressive,q_auto,w_100/v1/img) 1x, url("https://example.com/dpr_2,f_auto,fl_progressive,q_auto,w_100/v1/img") 2x'
+    expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
+  })
+
+  test('should parse image-set-options with resolution', async () => {
+    const source = ` "foo.png" 1x,
+                     "foo-2x.png" 2x,
+                     "foo-print.png" 600dpi`
+    const result = '"foo.png" 1x, "foo-2x.png" 2x, "foo-print.png" 600dpi'
+    expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
+  })
+
+  test('should parse image-set-options with type', async () => {
+    const source = ` "foo.avif" type("image/avif"),
+                     "foo.jpg" type("image/jpeg") `
+    const result = '"foo.avif" type("image/avif"), "foo.jpg" type("image/jpeg")'
+    expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
+  })
+
+  test('should parse image-set-options with linear-gradient', async () => {
+    const source = `linear-gradient(cornflowerblue, white) 1x,
+                    url("detailed-gradient.png") 3x`
+    const result =
+      'linear-gradient(cornflowerblue, white) 1x, url("detailed-gradient.png") 3x'
+    expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
+  })
+
+  test('should parse image-set-options with resolution and type specified', async () => {
+    const source = `url("picture.png")\t1x\t type("image/jpeg")`
+    const result = 'url("picture.png") 1x type("image/jpeg")'
+    expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
+  })
 })
 
 describe('flattenId', () => {
diff --git a/packages/vite/src/node/utils.ts b/packages/vite/src/node/utils.ts
index d94b1e043d5b37..037fe5641330c2 100644
--- a/packages/vite/src/node/utils.ts
+++ b/packages/vite/src/node/utils.ts
@@ -727,10 +727,15 @@ function joinSrcset(ret: ImageCandidate[]) {
     .join(', ')
 }
 
+/*!
+ * Based on https://github.com/sindresorhus/srcset
+ * MIT License, Copyright (c) Sindre Sorhus <sindresorhus@gmail.com> (https://sindresorhus.com)
+ */
 /**
- This regex represents a loose rule of an “image candidate string”.
+ This regex represents a loose rule of an “image candidate string” and "image set options".
 
  @see https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
+ @see https://drafts.csswg.org/css-images-4/#image-set-notation
 
   An “image candidate string” roughly consists of the following:
   1. Zero or more whitespace characters.
@@ -752,8 +757,10 @@ export function parseSrcset(string: string): ImageCandidate[] {
     .split(imageCandidateRegex)
     .filter((_part, index) => index % 2 === 1)
     .map((part) => {
-      const [url, descriptor] = part.trim().split(/\s+/)
-      return { url, descriptor }
+      const [url, ...descriptors] = part.trim().split(/\s+/)
+      // in `image-set()` context descriptor can have `resolution` and `type` parts separated by whitespace,
+      // we need to join them back together
+      return { url, descriptor: descriptors.join(' ') }
     })
     .filter(({ url }) => !!url)
 }

From d4faba99f281786f3dfb65168705e2579d122925 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ale=C5=A1=20Roub=C3=AD=C4=8Dek?= <ales@roubicek.name>
Date: Tue, 1 Oct 2024 14:41:53 +0200
Subject: [PATCH 3/6] fix: srcset - rework regex to extract url and descriptor
 via group matches

---
 .../vite/src/node/__tests__/utils.spec.ts     | 15 ++++++
 packages/vite/src/node/utils.ts               | 47 +++++++------------
 2 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/packages/vite/src/node/__tests__/utils.spec.ts b/packages/vite/src/node/__tests__/utils.spec.ts
index c61e1e6695d9ef..784df1f1b38d2e 100644
--- a/packages/vite/src/node/__tests__/utils.spec.ts
+++ b/packages/vite/src/node/__tests__/utils.spec.ts
@@ -407,6 +407,21 @@ describe('processSrcSetSync', () => {
     const result = 'url("picture.png") 1x type("image/jpeg")'
     expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
   })
+
+  test('should capture whole image set options', async () => {
+    const source = `linear-gradient(cornflowerblue, white) 1x,
+                    url("detailed-gradient.png") 3x`
+    const expected = [
+      'linear-gradient(cornflowerblue, white)',
+      'url("detailed-gradient.png")',
+    ]
+    const result: string[] = []
+    processSrcSetSync(source, ({ url }) => {
+      result.push(url)
+      return url
+    })
+    expect(result).toEqual(expected)
+  })
 })
 
 describe('flattenId', () => {
diff --git a/packages/vite/src/node/utils.ts b/packages/vite/src/node/utils.ts
index 037fe5641330c2..635deea549e992 100644
--- a/packages/vite/src/node/utils.ts
+++ b/packages/vite/src/node/utils.ts
@@ -727,42 +727,29 @@ function joinSrcset(ret: ImageCandidate[]) {
     .join(', ')
 }
 
-/*!
- * Based on https://github.com/sindresorhus/srcset
- * MIT License, Copyright (c) Sindre Sorhus <sindresorhus@gmail.com> (https://sindresorhus.com)
- */
-/**
- This regex represents a loose rule of an “image candidate string” and "image set options".
-
- @see https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
- @see https://drafts.csswg.org/css-images-4/#image-set-notation
-
-  An “image candidate string” roughly consists of the following:
-  1. Zero or more whitespace characters.
-  2. A non-empty URL that does not start or end with `,`.
-  3. Zero or more whitespace characters.
-  4. An optional “descriptor” that starts with a whitespace character.
-  5. Zero or more whitespace characters.
-  6. Each image candidate string is separated by a `,`.
- */
-// eslint-disable-next-line regexp/no-super-linear-backtracking
-const imageCandidateRegex = /\s*([^,]\S*[^,](?:\s[^,]+)?)\s*(?:,|$)/
 const escapedSpaceCharacters = /(?: |\\t|\\n|\\f|\\r)+/g
 
 export function parseSrcset(string: string): ImageCandidate[] {
-  return string
+  const matches = string
+    .trim()
     .replace(escapedSpaceCharacters, ' ')
     .replace(/\r?\n/, '')
     .replace(/,\s+/, ', ')
-    .split(imageCandidateRegex)
-    .filter((_part, index) => index % 2 === 1)
-    .map((part) => {
-      const [url, ...descriptors] = part.trim().split(/\s+/)
-      // in `image-set()` context descriptor can have `resolution` and `type` parts separated by whitespace,
-      // we need to join them back together
-      return { url, descriptor: descriptors.join(' ') }
-    })
-    .filter(({ url }) => !!url)
+    .replaceAll(/\s+/g, ' ')
+    .matchAll(
+      /**
+       This regex represents a loose rule of an “image candidate string” and "image set options".
+
+       @see https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
+       @see https://drafts.csswg.org/css-images-4/#image-set-notation
+       */
+      // eslint-disable-next-line regexp/no-super-linear-backtracking
+      /\s*(?<url>[\w-]+\([^)]*\)|"[^"]*"|'[^']*'|[^,]\S*[^,])\s*(?<descriptor>[^,]*(?:\stype\([^)]*\))?)\s*(?:,|$)/g,
+    )
+  return Array.from(matches, ({ groups }) => ({
+    url: groups.url.trim(),
+    descriptor: groups.descriptor.trim(),
+  })).filter(({ url }) => !!url)
 }
 
 export function processSrcSet(

From ac100295e2b3203db92b709ddd69f547d200ba18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ale=C5=A1=20Roub=C3=AD=C4=8Dek?= <ales@roubicek.name>
Date: Tue, 1 Oct 2024 14:46:23 +0200
Subject: [PATCH 4/6] fix possibly undefined groups

---
 packages/vite/src/node/utils.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/vite/src/node/utils.ts b/packages/vite/src/node/utils.ts
index 635deea549e992..e7398a3f4ee743 100644
--- a/packages/vite/src/node/utils.ts
+++ b/packages/vite/src/node/utils.ts
@@ -747,8 +747,8 @@ export function parseSrcset(string: string): ImageCandidate[] {
       /\s*(?<url>[\w-]+\([^)]*\)|"[^"]*"|'[^']*'|[^,]\S*[^,])\s*(?<descriptor>[^,]*(?:\stype\([^)]*\))?)\s*(?:,|$)/g,
     )
   return Array.from(matches, ({ groups }) => ({
-    url: groups.url.trim(),
-    descriptor: groups.descriptor.trim(),
+    url: groups?.url?.trim() ?? '',
+    descriptor: groups?.descriptor?.trim() ?? '',
   })).filter(({ url }) => !!url)
 }
 

From 2349c9b0cf0167f48672a8d7ff8239742819ab77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ale=C5=A1=20Roub=C3=AD=C4=8Dek?= <ales@roubicek.name>
Date: Wed, 2 Oct 2024 04:55:49 +0200
Subject: [PATCH 5/6] improved regex so we can remove eslint ignore rule,
 improved doc comment for regex

---
 packages/vite/src/node/utils.ts | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/packages/vite/src/node/utils.ts b/packages/vite/src/node/utils.ts
index e7398a3f4ee743..3d8c6ad4aacc0a 100644
--- a/packages/vite/src/node/utils.ts
+++ b/packages/vite/src/node/utils.ts
@@ -727,6 +727,22 @@ function joinSrcset(ret: ImageCandidate[]) {
     .join(', ')
 }
 
+/**
+ This regex represents a loose rule of an “image candidate string” and "image set options".
+
+ @see https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
+ @see https://drafts.csswg.org/css-images-4/#image-set-notation
+
+  The Regex has named capturing groups `url` and `descriptor`.
+  The `url` group can be:
+  * any CSS function
+  * CSS string (single or double-quoted)
+  * or URL string.
+  The `descriptor` is anything after the space and before the comma,
+  and can have optional `type(...)` for the `image-set` case.
+ */
+const imageCandidateRegex =
+  /(?:^|\s)(?<url>[\w-]+\([^)]*\)|"[^"]*"|'[^']*'|[^,]\S*[^,])\s*(?:\s(?<descriptor>[^,\s]+(?:\stype\([^)]*\))?)\s*)?(?:,|$)/g
 const escapedSpaceCharacters = /(?: |\\t|\\n|\\f|\\r)+/g
 
 export function parseSrcset(string: string): ImageCandidate[] {
@@ -736,16 +752,7 @@ export function parseSrcset(string: string): ImageCandidate[] {
     .replace(/\r?\n/, '')
     .replace(/,\s+/, ', ')
     .replaceAll(/\s+/g, ' ')
-    .matchAll(
-      /**
-       This regex represents a loose rule of an “image candidate string” and "image set options".
-
-       @see https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
-       @see https://drafts.csswg.org/css-images-4/#image-set-notation
-       */
-      // eslint-disable-next-line regexp/no-super-linear-backtracking
-      /\s*(?<url>[\w-]+\([^)]*\)|"[^"]*"|'[^']*'|[^,]\S*[^,])\s*(?<descriptor>[^,]*(?:\stype\([^)]*\))?)\s*(?:,|$)/g,
-    )
+    .matchAll(imageCandidateRegex)
   return Array.from(matches, ({ groups }) => ({
     url: groups?.url?.trim() ?? '',
     descriptor: groups?.descriptor?.trim() ?? '',

From 770d6330cbd2af2feab6379e44c3eec604a253c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ale=C5=A1=20Roub=C3=AD=C4=8Dek?= <ales@roubicek.name>
Date: Wed, 2 Oct 2024 08:39:14 +0200
Subject: [PATCH 6/6] Makes descriptor regex more generic, added test case for
 any order of descriptor parts

---
 packages/vite/src/node/__tests__/utils.spec.ts | 5 +++--
 packages/vite/src/node/utils.ts                | 7 +++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/packages/vite/src/node/__tests__/utils.spec.ts b/packages/vite/src/node/__tests__/utils.spec.ts
index 784df1f1b38d2e..76e11050d8d401 100644
--- a/packages/vite/src/node/__tests__/utils.spec.ts
+++ b/packages/vite/src/node/__tests__/utils.spec.ts
@@ -403,8 +403,9 @@ describe('processSrcSetSync', () => {
   })
 
   test('should parse image-set-options with resolution and type specified', async () => {
-    const source = `url("picture.png")\t1x\t type("image/jpeg")`
-    const result = 'url("picture.png") 1x type("image/jpeg")'
+    const source = `url("picture.png")\t1x\t type("image/jpeg"), url("picture.png")\t type("image/jpeg")\t2x`
+    const result =
+      'url("picture.png") 1x type("image/jpeg"), url("picture.png") type("image/jpeg") 2x'
     expect(processSrcSetSync(source, ({ url }) => url)).toBe(result)
   })
 
diff --git a/packages/vite/src/node/utils.ts b/packages/vite/src/node/utils.ts
index 3d8c6ad4aacc0a..6c54841118fb19 100644
--- a/packages/vite/src/node/utils.ts
+++ b/packages/vite/src/node/utils.ts
@@ -737,12 +737,11 @@ function joinSrcset(ret: ImageCandidate[]) {
   The `url` group can be:
   * any CSS function
   * CSS string (single or double-quoted)
-  * or URL string.
-  The `descriptor` is anything after the space and before the comma,
-  and can have optional `type(...)` for the `image-set` case.
+  * URL string (unquoted)
+  The `descriptor` is anything after the space and before the comma.
  */
 const imageCandidateRegex =
-  /(?:^|\s)(?<url>[\w-]+\([^)]*\)|"[^"]*"|'[^']*'|[^,]\S*[^,])\s*(?:\s(?<descriptor>[^,\s]+(?:\stype\([^)]*\))?)\s*)?(?:,|$)/g
+  /(?:^|\s)(?<url>[\w-]+\([^)]*\)|"[^"]*"|'[^']*'|[^,]\S*[^,])\s*(?:\s(?<descriptor>\w[^,]+))?(?:,|$)/g
 const escapedSpaceCharacters = /(?: |\\t|\\n|\\f|\\r)+/g
 
 export function parseSrcset(string: string): ImageCandidate[] {