gh-94808: Cover `PyUnicode_Count` in CAPI #96929

sobolevn · 2022-09-19T09:22:04Z

It is heavily inspired by

Lines 99 to 161 in cbdeda8

    
           def test_count(self): 
        
               self.checkequal(3, 'aaa', 'count', 'a') 
        
               self.checkequal(0, 'aaa', 'count', 'b') 
        
               self.checkequal(3, 'aaa', 'count', 'a') 
        
               self.checkequal(0, 'aaa', 'count', 'b') 
        
               self.checkequal(3, 'aaa', 'count', 'a') 
        
               self.checkequal(0, 'aaa', 'count', 'b') 
        
               self.checkequal(0, 'aaa', 'count', 'b') 
        
               self.checkequal(2, 'aaa', 'count', 'a', 1) 
        
               self.checkequal(0, 'aaa', 'count', 'a', 10) 
        
               self.checkequal(1, 'aaa', 'count', 'a', -1) 
        
               self.checkequal(3, 'aaa', 'count', 'a', -10) 
        
               self.checkequal(1, 'aaa', 'count', 'a', 0, 1) 
        
               self.checkequal(3, 'aaa', 'count', 'a', 0, 10) 
        
               self.checkequal(2, 'aaa', 'count', 'a', 0, -1) 
        
               self.checkequal(0, 'aaa', 'count', 'a', 0, -10) 
        
               self.checkequal(3, 'aaa', 'count', '', 1) 
        
               self.checkequal(1, 'aaa', 'count', '', 3) 
        
               self.checkequal(0, 'aaa', 'count', '', 10) 
        
               self.checkequal(2, 'aaa', 'count', '', -1) 
        
               self.checkequal(4, 'aaa', 'count', '', -10) 
        
               self.checkequal(1, '', 'count', '') 
        
               self.checkequal(0, '', 'count', '', 1, 1) 
        
               self.checkequal(0, '', 'count', '', sys.maxsize, 0) 
        
               self.checkequal(0, '', 'count', 'xx') 
        
               self.checkequal(0, '', 'count', 'xx', 1, 1) 
        
               self.checkequal(0, '', 'count', 'xx', sys.maxsize, 0) 
        
               self.checkraises(TypeError, 'hello', 'count') 
        
               if self.contains_bytes: 
        
                   self.checkequal(0, 'hello', 'count', 42) 
        
               else: 
        
                   self.checkraises(TypeError, 'hello', 'count', 42) 
        
               # For a variety of combinations, 
        
               #    verify that str.count() matches an equivalent function 
        
               #    replacing all occurrences and then differencing the string lengths 
        
               charset = ['', 'a', 'b'] 
        
               digits = 7 
        
               base = len(charset) 
        
               teststrings = set() 
        
               for i in range(base ** digits): 
        
                   entry = [] 
        
                   for j in range(digits): 
        
                       i, m = divmod(i, base) 
        
                       entry.append(charset[m]) 
        
                   teststrings.add(''.join(entry)) 
        
               teststrings = [self.fixtype(ts) for ts in teststrings] 
        
               for i in teststrings: 
        
                   n = len(i) 
        
                   for j in teststrings: 
        
                       r1 = i.count(j) 
        
                       if j: 
        
                           r2, rem = divmod(n - len(i.replace(j, self.fixtype(''))), 
        
                                            len(j)) 
        
                       else: 
        
                           r2, rem = len(i)+1, 0 
        
                       if rem or r1 != r2: 
        
                           self.assertEqual(rem, 0, '%s != 0 for %s' % (rem, i)) 
        
                           self.assertEqual(r1, r2, '%s != %s for %s' % (r1, r2, i))

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count? They look pretty similar:

cpython/Objects/unicodeobject.c

Lines 8968 to 9040 in cbdeda8

    
           /* Count occurrences of substr inside str.
            *
            * start and end are first passed through ADJUST_INDICES together with
            * the length of str; the search then covers the end - start code
            * units beginning at offset start.
            *
            * Returns the value produced by the kind-specific *_count helper,
            * 0 when substr has a wider kind than str or is longer than the
            * searched range, and -1 when ensure_unicode() rejects an argument or
            * unicode_askind() returns NULL.
            */
           Py_ssize_t
           PyUnicode_Count(PyObject *str,
                           PyObject *substr,
                           Py_ssize_t start,
                           Py_ssize_t end)
           {
               Py_ssize_t count = -1;      /* stays -1 unless a helper succeeds */
               int str_kind, sub_kind;
               const void *haystack = NULL, *needle = NULL;
               Py_ssize_t str_len, sub_len;

               if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) {
                   return -1;
               }

               str_kind = PyUnicode_KIND(str);
               sub_kind = PyUnicode_KIND(substr);
               if (sub_kind > str_kind) {
                   return 0;
               }

               str_len = PyUnicode_GET_LENGTH(str);
               sub_len = PyUnicode_GET_LENGTH(substr);
               ADJUST_INDICES(start, end, str_len);
               if (sub_len > end - start) {
                   return 0;
               }

               haystack = PyUnicode_DATA(str);
               needle = PyUnicode_DATA(substr);
               if (sub_kind != str_kind) {
                   /* Obtain a copy of substr's data in str's kind; it is released
                      with PyMem_Free() at the exit label. */
                   needle = unicode_askind(sub_kind, needle, sub_len, str_kind);
                   if (needle == NULL) {
                       goto done;
                   }
               }

               switch (str_kind) {
               case PyUnicode_1BYTE_KIND:
                   if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) {
                       count = asciilib_count(
                           ((const Py_UCS1*)haystack) + start, end - start,
                           needle, sub_len, PY_SSIZE_T_MAX
                           );
                   }
                   else {
                       count = ucs1lib_count(
                           ((const Py_UCS1*)haystack) + start, end - start,
                           needle, sub_len, PY_SSIZE_T_MAX
                           );
                   }
                   break;
               case PyUnicode_2BYTE_KIND:
                   count = ucs2lib_count(
                       ((const Py_UCS2*)haystack) + start, end - start,
                       needle, sub_len, PY_SSIZE_T_MAX
                       );
                   break;
               case PyUnicode_4BYTE_KIND:
                   count = ucs4lib_count(
                       ((const Py_UCS4*)haystack) + start, end - start,
                       needle, sub_len, PY_SSIZE_T_MAX
                       );
                   break;
               default:
                   Py_UNREACHABLE();
               }

           done:
               /* Shared exit for success and conversion failure: needle differs
                  from substr's own buffer exactly when a conversion was requested. */
               assert((sub_kind != str_kind) == (needle != PyUnicode_DATA(substr)));
               if (sub_kind != str_kind) {
                   PyMem_Free((void *)needle);
               }
               return count;
           }

And

cpython/Objects/unicodeobject.c

Lines 10854 to 10916 in cbdeda8

    
           /* Method body behind "count": parses (substring[, start[, end]]) from
            * args via parse_args_finds_unicode(), counts occurrences of the
            * substring in self with the kind-specific *_count helper, and returns
            * the result as a Python int.
            *
            * Returns NULL when argument parsing fails or unicode_askind()
            * returns NULL.
            */
           static PyObject *
           unicode_count(PyObject *self, PyObject *args)
           {
               PyObject *needle_obj = NULL;   /* pre-initialised to silence a compiler warning */
               Py_ssize_t start = 0;
               Py_ssize_t end = PY_SSIZE_T_MAX;
               PyObject *answer;
               int self_kind, needle_kind;
               const void *haystack, *needle;
               Py_ssize_t self_len, needle_len, hits;

               if (!parse_args_finds_unicode("count", args, &needle_obj, &start, &end)) {
                   return NULL;
               }

               self_kind = PyUnicode_KIND(self);
               needle_kind = PyUnicode_KIND(needle_obj);
               if (needle_kind > self_kind) {
                   return PyLong_FromLong(0);
               }

               self_len = PyUnicode_GET_LENGTH(self);
               needle_len = PyUnicode_GET_LENGTH(needle_obj);
               ADJUST_INDICES(start, end, self_len);
               if (needle_len > end - start) {
                   return PyLong_FromLong(0);
               }

               haystack = PyUnicode_DATA(self);
               needle = PyUnicode_DATA(needle_obj);
               if (needle_kind != self_kind) {
                   /* Obtain a copy of the substring data in self's kind; it is
                      released with PyMem_Free() before returning. */
                   needle = unicode_askind(needle_kind, needle, needle_len, self_kind);
                   if (needle == NULL) {
                       return NULL;
                   }
               }

               switch (self_kind) {
               case PyUnicode_1BYTE_KIND:
                   hits = ucs1lib_count(
                       ((const Py_UCS1*)haystack) + start, end - start,
                       needle, needle_len, PY_SSIZE_T_MAX
                       );
                   break;
               case PyUnicode_2BYTE_KIND:
                   hits = ucs2lib_count(
                       ((const Py_UCS2*)haystack) + start, end - start,
                       needle, needle_len, PY_SSIZE_T_MAX
                       );
                   break;
               case PyUnicode_4BYTE_KIND:
                   hits = ucs4lib_count(
                       ((const Py_UCS4*)haystack) + start, end - start,
                       needle, needle_len, PY_SSIZE_T_MAX
                       );
                   break;
               default:
                   Py_UNREACHABLE();
               }

               answer = PyLong_FromSsize_t(hits);

               /* needle equals the substring's own buffer exactly when no
                  conversion was requested. */
               assert((needle_kind == self_kind) == (needle == PyUnicode_DATA(needle_obj)));
               if (needle_kind != self_kind) {
                   PyMem_Free((void *)needle);
               }

               return answer;
           }

Issue: Metabug: Improving C-level coverage #94808

mdboom · 2022-09-22T19:45:02Z

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count?

It looks like these both date to the same commit d57fd91 from 2000-03-10. They were pretty different then, but are almost the same now. I see some benefit in making unicode_count call PyUnicode_Count to make sure they remain consistent, but I could also see someone seeing this as "churn for churn's sake".

Note there is also anylib_count which is a subset of unicode_count and PyUnicode_Count.

There are a few other instances of this kind of thing I've come across looking at coverage -- it would be good to get a core developer's take on whether merging internal and external functions where they are clearly wrappable like this would be welcome.

encukou · 2022-10-06T15:24:07Z

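Apparently unicode_count missed an optimization in 2011: for 1-byte strings PyUnicode_Count switches to asciilib_count when both strings are ASCII, while unicode_count always calls ucs1lib_count. Otherwise they're equivalent, except for argument parsing and converting the return value. Merging them could add the optimization to unicode_count.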
If you want to work on that, note that there's also anylib_count that duplicates the main switch.

sobolevn · 2022-10-06T17:04:14Z

Thanks! Yes, I would like to do that! I will open a new issue for it.

* main: pythonGH-88050: fix race in closing subprocess pipe in asyncio (python#97951) pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962) pythongh-95986: Fix the example using match keyword (python#95989) pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944) pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929) pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610) pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703) pythonGH-88968: Add notes about socket ownership transfers (python#97936) pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528)

* main: (53 commits) pythongh-94808: Coverage: Test that maximum indentation level is handled (python#95926) pythonGH-88050: fix race in closing subprocess pipe in asyncio (python#97951) pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962) pythongh-95986: Fix the example using match keyword (python#95989) pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944) pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929) pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610) pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703) pythonGH-88968: Add notes about socket ownership transfers (python#97936) pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528) pythongh-65961: Raise `DeprecationWarning` when `__package__` differs from `__spec__.parent` (python#97879) docs(typing): add "see PEP 675" to LiteralString (python#97926) pythongh-97850: Remove all known instances of module_repr() (python#97876) I changed my surname early this year (python#96671) pythongh-93738: Documentation C syntax (:c:type:<C type> -> :c:expr:<C type>) (python#97768) pythongh-91539: improve performance of get_proxies_environment (python#91566) build(deps): bump actions/stale from 5 to 6 (python#97701) pythonGH-95172 Make the same version `versionadded` oneline (python#95172) pythongh-88050: Fix asyncio subprocess to kill process cleanly when process is blocked (python#32073) pythongh-93738: Documentation C syntax (Function glob patterns -> literal markup) (python#97774) ...

pythongh-94808: Cover PyUnicode_Count in CAPI

22aee9a

bedevere-bot added the awaiting review label Sep 19, 2022

sobolevn added tests Tests in the Lib/test dir skip news labels Sep 19, 2022

sobolevn requested a review from encukou September 19, 2022 09:49

encukou approved these changes Oct 6, 2022

View reviewed changes

bedevere-bot added awaiting merge and removed awaiting review labels Oct 6, 2022

encukou merged commit e63d7da into python:main Oct 6, 2022

bedevere-bot removed the awaiting merge label Oct 6, 2022

sobolevn mentioned this pull request Oct 6, 2022

Unify PyUnicode_Count and unicode_count #97982

Closed

mpage pushed a commit to mpage/cpython that referenced this pull request Oct 11, 2022

pythongh-94808: Cover PyUnicode_Count in CAPI (python#96929)

d458682

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

Uh oh!

sobolevn commented Sep 19, 2022 •

edited by bedevere-bot

Loading

Uh oh!

mdboom commented Sep 22, 2022

Uh oh!

encukou commented Oct 6, 2022

Uh oh!

sobolevn commented Oct 6, 2022

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

	def test_count(self):
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(2, 'aaa', 'count', 'a', 1)
	self.checkequal(0, 'aaa', 'count', 'a', 10)
	self.checkequal(1, 'aaa', 'count', 'a', -1)
	self.checkequal(3, 'aaa', 'count', 'a', -10)
	self.checkequal(1, 'aaa', 'count', 'a', 0, 1)
	self.checkequal(3, 'aaa', 'count', 'a', 0, 10)
	self.checkequal(2, 'aaa', 'count', 'a', 0, -1)
	self.checkequal(0, 'aaa', 'count', 'a', 0, -10)
	self.checkequal(3, 'aaa', 'count', '', 1)
	self.checkequal(1, 'aaa', 'count', '', 3)
	self.checkequal(0, 'aaa', 'count', '', 10)
	self.checkequal(2, 'aaa', 'count', '', -1)
	self.checkequal(4, 'aaa', 'count', '', -10)

	self.checkequal(1, '', 'count', '')
	self.checkequal(0, '', 'count', '', 1, 1)
	self.checkequal(0, '', 'count', '', sys.maxsize, 0)

	self.checkequal(0, '', 'count', 'xx')
	self.checkequal(0, '', 'count', 'xx', 1, 1)
	self.checkequal(0, '', 'count', 'xx', sys.maxsize, 0)

	self.checkraises(TypeError, 'hello', 'count')

	if self.contains_bytes:
	self.checkequal(0, 'hello', 'count', 42)
	else:
	self.checkraises(TypeError, 'hello', 'count', 42)

	# For a variety of combinations,
	# verify that str.count() matches an equivalent function
	# replacing all occurrences and then differencing the string lengths
	charset = ['', 'a', 'b']
	digits = 7
	base = len(charset)
	teststrings = set()
	for i in range(base ** digits):
	entry = []
	for j in range(digits):
	i, m = divmod(i, base)
	entry.append(charset[m])
	teststrings.add(''.join(entry))
	teststrings = [self.fixtype(ts) for ts in teststrings]
	for i in teststrings:
	n = len(i)
	for j in teststrings:
	r1 = i.count(j)
	if j:
	r2, rem = divmod(n - len(i.replace(j, self.fixtype(''))),
	len(j))
	else:
	r2, rem = len(i)+1, 0
	if rem or r1 != r2:
	self.assertEqual(rem, 0, '%s != 0 for %s' % (rem, i))
	self.assertEqual(r1, r2, '%s != %s for %s' % (r1, r2, i))

	/* Count occurrences of substr inside str.
	 *
	 * start and end are first passed through ADJUST_INDICES together with
	 * the length of str; the search then covers the end - start code units
	 * beginning at offset start.
	 *
	 * Returns the value produced by the kind-specific *_count helper, 0 when
	 * substr has a wider kind than str or is longer than the searched range,
	 * and -1 when ensure_unicode() rejects an argument or unicode_askind()
	 * returns NULL.
	 */
	Py_ssize_t
	PyUnicode_Count(PyObject *str,
	                PyObject *substr,
	                Py_ssize_t start,
	                Py_ssize_t end)
	{
	    Py_ssize_t result;
	    int kind1, kind2;
	    const void *buf1 = NULL, *buf2 = NULL;
	    Py_ssize_t len1, len2;

	    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
	        return -1;

	    kind1 = PyUnicode_KIND(str);
	    kind2 = PyUnicode_KIND(substr);
	    if (kind1 < kind2)
	        return 0;

	    len1 = PyUnicode_GET_LENGTH(str);
	    len2 = PyUnicode_GET_LENGTH(substr);
	    ADJUST_INDICES(start, end, len1);
	    if (end - start < len2)
	        return 0;

	    buf1 = PyUnicode_DATA(str);
	    buf2 = PyUnicode_DATA(substr);
	    if (kind2 != kind1) {
	        /* Obtain a copy of substr's data in str's kind; released with
	           PyMem_Free() on both exit paths below. */
	        buf2 = unicode_askind(kind2, buf2, len2, kind1);
	        if (!buf2)
	            goto onError;
	    }

	    switch (kind1) {
	    case PyUnicode_1BYTE_KIND:
	        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
	            result = asciilib_count(
	                ((const Py_UCS1*)buf1) + start, end - start,
	                buf2, len2, PY_SSIZE_T_MAX
	                );
	        else
	            result = ucs1lib_count(
	                ((const Py_UCS1*)buf1) + start, end - start,
	                buf2, len2, PY_SSIZE_T_MAX
	                );
	        break;
	    case PyUnicode_2BYTE_KIND:
	        result = ucs2lib_count(
	            ((const Py_UCS2*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_4BYTE_KIND:
	        result = ucs4lib_count(
	            ((const Py_UCS4*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    default:
	        Py_UNREACHABLE();
	    }

	    /* buf2 differs from substr's own buffer exactly when a conversion
	       was requested. */
	    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);

	    return result;
	  onError:
	    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);
	    return -1;
	}

	/* Method body behind "count": parses (substring[, start[, end]]) from args
	 * via parse_args_finds_unicode(), counts occurrences of the substring in
	 * self with the kind-specific *_count helper, and returns the result as a
	 * Python int.
	 *
	 * Returns NULL when argument parsing fails or unicode_askind() returns NULL.
	 */
	static PyObject *
	unicode_count(PyObject *self, PyObject *args)
	{
	    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
	    Py_ssize_t start = 0;
	    Py_ssize_t end = PY_SSIZE_T_MAX;
	    PyObject *result;
	    int kind1, kind2;
	    const void *buf1, *buf2;
	    Py_ssize_t len1, len2, iresult;

	    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
	        return NULL;

	    kind1 = PyUnicode_KIND(self);
	    kind2 = PyUnicode_KIND(substring);
	    if (kind1 < kind2)
	        return PyLong_FromLong(0);

	    len1 = PyUnicode_GET_LENGTH(self);
	    len2 = PyUnicode_GET_LENGTH(substring);
	    ADJUST_INDICES(start, end, len1);
	    if (end - start < len2)
	        return PyLong_FromLong(0);

	    buf1 = PyUnicode_DATA(self);
	    buf2 = PyUnicode_DATA(substring);
	    if (kind2 != kind1) {
	        /* Obtain a copy of the substring data in self's kind; released
	           with PyMem_Free() before returning. */
	        buf2 = unicode_askind(kind2, buf2, len2, kind1);
	        if (!buf2)
	            return NULL;
	    }
	    switch (kind1) {
	    case PyUnicode_1BYTE_KIND:
	        iresult = ucs1lib_count(
	            ((const Py_UCS1*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_2BYTE_KIND:
	        iresult = ucs2lib_count(
	            ((const Py_UCS2*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_4BYTE_KIND:
	        iresult = ucs4lib_count(
	            ((const Py_UCS4*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    default:
	        Py_UNREACHABLE();
	    }

	    result = PyLong_FromSsize_t(iresult);

	    /* buf2 equals the substring's own buffer exactly when no conversion
	       was requested. */
	    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);

	    return result;
	}

Uh oh!

gh-94808: Cover PyUnicode_Count in CAPI #96929

gh-94808: Cover PyUnicode_Count in CAPI #96929

Uh oh!

Conversation

sobolevn commented Sep 19, 2022 • edited by bedevere-bot Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

mdboom commented Sep 22, 2022

Uh oh!

encukou commented Oct 6, 2022

Uh oh!

sobolevn commented Oct 6, 2022

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

sobolevn commented Sep 19, 2022 •

edited by bedevere-bot

Loading