Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-94808: Cover PyUnicode_Count in CAPI #96929

Merged
merged 1 commit into from
Oct 6, 2022

Conversation

sobolevn
Copy link
Member

@sobolevn sobolevn commented Sep 19, 2022

It is heavily inspired by

def test_count(self):
self.checkequal(3, 'aaa', 'count', 'a')
self.checkequal(0, 'aaa', 'count', 'b')
self.checkequal(3, 'aaa', 'count', 'a')
self.checkequal(0, 'aaa', 'count', 'b')
self.checkequal(3, 'aaa', 'count', 'a')
self.checkequal(0, 'aaa', 'count', 'b')
self.checkequal(0, 'aaa', 'count', 'b')
self.checkequal(2, 'aaa', 'count', 'a', 1)
self.checkequal(0, 'aaa', 'count', 'a', 10)
self.checkequal(1, 'aaa', 'count', 'a', -1)
self.checkequal(3, 'aaa', 'count', 'a', -10)
self.checkequal(1, 'aaa', 'count', 'a', 0, 1)
self.checkequal(3, 'aaa', 'count', 'a', 0, 10)
self.checkequal(2, 'aaa', 'count', 'a', 0, -1)
self.checkequal(0, 'aaa', 'count', 'a', 0, -10)
self.checkequal(3, 'aaa', 'count', '', 1)
self.checkequal(1, 'aaa', 'count', '', 3)
self.checkequal(0, 'aaa', 'count', '', 10)
self.checkequal(2, 'aaa', 'count', '', -1)
self.checkequal(4, 'aaa', 'count', '', -10)
self.checkequal(1, '', 'count', '')
self.checkequal(0, '', 'count', '', 1, 1)
self.checkequal(0, '', 'count', '', sys.maxsize, 0)
self.checkequal(0, '', 'count', 'xx')
self.checkequal(0, '', 'count', 'xx', 1, 1)
self.checkequal(0, '', 'count', 'xx', sys.maxsize, 0)
self.checkraises(TypeError, 'hello', 'count')
if self.contains_bytes:
self.checkequal(0, 'hello', 'count', 42)
else:
self.checkraises(TypeError, 'hello', 'count', 42)
# For a variety of combinations,
# verify that str.count() matches an equivalent function
# replacing all occurrences and then differencing the string lengths
charset = ['', 'a', 'b']
digits = 7
base = len(charset)
teststrings = set()
for i in range(base ** digits):
entry = []
for j in range(digits):
i, m = divmod(i, base)
entry.append(charset[m])
teststrings.add(''.join(entry))
teststrings = [self.fixtype(ts) for ts in teststrings]
for i in teststrings:
n = len(i)
for j in teststrings:
r1 = i.count(j)
if j:
r2, rem = divmod(n - len(i.replace(j, self.fixtype(''))),
len(j))
else:
r2, rem = len(i)+1, 0
if rem or r1 != r2:
self.assertEqual(rem, 0, '%s != 0 for %s' % (rem, i))
self.assertEqual(r1, r2, '%s != %s for %s' % (r1, r2, i))

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count? They look pretty similar:

Py_ssize_t
PyUnicode_Count(PyObject *str,
PyObject *substr,
Py_ssize_t start,
Py_ssize_t end)
{
Py_ssize_t result;
int kind1, kind2;
const void *buf1 = NULL, *buf2 = NULL;
Py_ssize_t len1, len2;
if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
return -1;
kind1 = PyUnicode_KIND(str);
kind2 = PyUnicode_KIND(substr);
if (kind1 < kind2)
return 0;
len1 = PyUnicode_GET_LENGTH(str);
len2 = PyUnicode_GET_LENGTH(substr);
ADJUST_INDICES(start, end, len1);
if (end - start < len2)
return 0;
buf1 = PyUnicode_DATA(str);
buf2 = PyUnicode_DATA(substr);
if (kind2 != kind1) {
buf2 = unicode_askind(kind2, buf2, len2, kind1);
if (!buf2)
goto onError;
}
switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
result = asciilib_count(
((const Py_UCS1*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
else
result = ucs1lib_count(
((const Py_UCS1*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
case PyUnicode_2BYTE_KIND:
result = ucs2lib_count(
((const Py_UCS2*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
case PyUnicode_4BYTE_KIND:
result = ucs4lib_count(
((const Py_UCS4*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
default:
Py_UNREACHABLE();
}
assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
if (kind2 != kind1)
PyMem_Free((void *)buf2);
return result;
onError:
assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
if (kind2 != kind1)
PyMem_Free((void *)buf2);
return -1;
}

And

cpython/Objects/unicodeobject.c

Lines 10854 to 10916 in cbdeda8

static PyObject *
unicode_count(PyObject *self, PyObject *args)
{
PyObject *substring = NULL; /* initialize to fix a compiler warning */
Py_ssize_t start = 0;
Py_ssize_t end = PY_SSIZE_T_MAX;
PyObject *result;
int kind1, kind2;
const void *buf1, *buf2;
Py_ssize_t len1, len2, iresult;
if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
return NULL;
kind1 = PyUnicode_KIND(self);
kind2 = PyUnicode_KIND(substring);
if (kind1 < kind2)
return PyLong_FromLong(0);
len1 = PyUnicode_GET_LENGTH(self);
len2 = PyUnicode_GET_LENGTH(substring);
ADJUST_INDICES(start, end, len1);
if (end - start < len2)
return PyLong_FromLong(0);
buf1 = PyUnicode_DATA(self);
buf2 = PyUnicode_DATA(substring);
if (kind2 != kind1) {
buf2 = unicode_askind(kind2, buf2, len2, kind1);
if (!buf2)
return NULL;
}
switch (kind1) {
case PyUnicode_1BYTE_KIND:
iresult = ucs1lib_count(
((const Py_UCS1*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
case PyUnicode_2BYTE_KIND:
iresult = ucs2lib_count(
((const Py_UCS2*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
case PyUnicode_4BYTE_KIND:
iresult = ucs4lib_count(
((const Py_UCS4*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
default:
Py_UNREACHABLE();
}
result = PyLong_FromSsize_t(iresult);
assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
if (kind2 != kind1)
PyMem_Free((void *)buf2);
return result;
}

@sobolevn sobolevn added tests Tests in the Lib/test dir skip news labels Sep 19, 2022
@sobolevn sobolevn requested a review from encukou September 19, 2022 09:49
@mdboom
Copy link
Contributor

mdboom commented Sep 22, 2022

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count?

It looks like these both date to the same commit d57fd91 from 2000-03-10. They were pretty different then, but are almost the same now. I see some benefit in making unicode_count call PyUnicode_Count to make sure they remain consistent, but I could also see someone seeing this as "churn for churn's sake".

Note there is also anylib_count which is a subset of unicode_count and PyUnicode_Count.

There are a few other instances of this kind of thing I've come across looking at coverage -- it would be good to get a core developer's take on whether merging internal and external functions where they are clearly wrappable like this would be welcome.

@encukou
Copy link
Member

encukou commented Oct 6, 2022

Apparently unicode_count missed an optimization in 2011, otherwise they're equivalent (except arg parsing & converting the return value). Merging them could add the optimization to unicode_count.
If you want to work on that, note that there's also anylib_count that duplicates the main switch.

@sobolevn
Copy link
Member Author

sobolevn commented Oct 6, 2022

Thanks! Yes, I would like to do that! I will open a new issue for it.

carljm added a commit to carljm/cpython that referenced this pull request Oct 6, 2022
* main:
  pythonGH-88050: fix race in closing subprocess pipe in asyncio  (python#97951)
  pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962)
  pythongh-95986: Fix the example using match keyword (python#95989)
  pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944)
  pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929)
  pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610)
  pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703)
  pythonGH-88968: Add notes about socket ownership transfers (python#97936)
  pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528)
carljm added a commit to carljm/cpython that referenced this pull request Oct 8, 2022
* main: (53 commits)
  pythongh-94808: Coverage: Test that maximum indentation level is handled (python#95926)
  pythonGH-88050: fix race in closing subprocess pipe in asyncio  (python#97951)
  pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962)
  pythongh-95986: Fix the example using match keyword (python#95989)
  pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944)
  pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929)
  pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610)
  pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703)
  pythonGH-88968: Add notes about socket ownership transfers (python#97936)
  pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528)
  pythongh-65961: Raise `DeprecationWarning` when `__package__` differs from `__spec__.parent` (python#97879)
  docs(typing): add "see PEP 675" to LiteralString (python#97926)
  pythongh-97850: Remove all known instances of module_repr() (python#97876)
  I changed my surname early this year (python#96671)
  pythongh-93738: Documentation C syntax (:c:type:<C type> -> :c:expr:<C type>) (python#97768)
  pythongh-91539: improve performance of get_proxies_environment  (python#91566)
  build(deps): bump actions/stale from 5 to 6 (python#97701)
  pythonGH-95172 Make the same version `versionadded` oneline (python#95172)
  pythongh-88050: Fix asyncio subprocess to kill process cleanly when process is blocked (python#32073)
  pythongh-93738: Documentation C syntax (Function glob patterns -> literal markup) (python#97774)
  ...
mpage pushed a commit to mpage/cpython that referenced this pull request Oct 11, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
skip news tests Tests in the Lib/test dir
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants