From c64ef05aa53778f899c28ed59e31ff2eae254797 Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Wed, 17 Dec 2025 16:39:02 +0530 Subject: [PATCH 01/14] BUG: fix empty suffix and prefix handling in pyarrow string methods Python's `str.removeprefix("")` and `str.removesuffix("")` return the original string. The current pyarrow-backed implementation slices with `start=len(prefix)` and `stop=-len(suffix)`; for an empty suffix `-len("")` is `0`, so the slice becomes `[0:0]` and matching values are truncated to the empty string instead of being preserved (for an empty prefix the slice starts at `0`, which already leaves values unchanged). This PR adds explicit guards for empty prefix and suffix inputs and includes tests to ensure parity with Python semantics. --- pandas/core/arrays/_arrow_string_mixins.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index e5e8ffe409788..0b2f90f7a1581 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -209,11 +209,14 @@ def _str_removeprefix(self, prefix: str): return self._from_pyarrow_array(result) def _str_removesuffix(self, suffix: str): + if suffix == "": + return self ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) return self._from_pyarrow_array(result) + def _str_startswith( self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ): From 74fb4f93c2490c7e87554398ff77b9dcf6e48521 Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Wed, 17 Dec 2025 18:47:17 +0530 Subject: [PATCH 02/14] BUG: add tests for empty prefix and suffix --- pandas/tests/strings/test_strings.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index e07ece91090df..64138667e8f18 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -544,7 +544,12 
@@ def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp): @pytest.mark.parametrize( - "prefix, expected", [("a", ["b", " b c", "bc"]), ("ab", ["", "a b c", "bc"])] + "prefix, expected", + [ + ("a", ["b", " b c", "bc"]), + ("ab", ["", "a b c", "bc"]), + ("", ["ab", "a b c", "bc"]), + ], ) def test_removeprefix(any_string_dtype, prefix, expected): ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype) @@ -554,7 +559,12 @@ def test_removeprefix(any_string_dtype, prefix, expected): @pytest.mark.parametrize( - "suffix, expected", [("c", ["ab", "a b ", "b"]), ("bc", ["ab", "a b c", ""])] + "suffix, expected", + [ + ("c", ["ab", "a b ", "b"]), + ("bc", ["ab", "a b c", ""]), + ("", ["ab", "a b c", "bc"]), + ], ) def test_removesuffix(any_string_dtype, suffix, expected): ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype) @@ -563,6 +573,7 @@ def test_removesuffix(any_string_dtype, suffix, expected): tm.assert_series_equal(result, ser_expected) + def test_string_slice_get_syntax(any_string_dtype): ser = Series( ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"], From a1da54e73098973a5f548a30745065717fb7d287 Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Wed, 17 Dec 2025 19:01:38 +0530 Subject: [PATCH 03/14] BUG: handle empty prefix in pyarrow-backed removeprefix --- pandas/core/arrays/_arrow_string_mixins.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 0b2f90f7a1581..566a05b0b2489 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -203,6 +203,8 @@ def _str_swapcase(self) -> Self: return self._from_pyarrow_array(pc.utf8_swapcase(self._pa_array)) def _str_removeprefix(self, prefix: str): + if prefix == "": + return self starts_with = pc.starts_with(self._pa_array, pattern=prefix) removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) result = 
pc.if_else(starts_with, removed, self._pa_array) From f71a75c19080a5335da132a8dc0c1977fc543e71 Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Wed, 17 Dec 2025 20:55:06 +0530 Subject: [PATCH 04/14] BUG: fix empty prefix/suffix handling in pyarrow-backed string methods --- pandas/core/arrays/_arrow_string_mixins.py | 44 +++++++++++++++------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 566a05b0b2489..7462a0f5b22c0 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -202,21 +202,37 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return self._from_pyarrow_array(pc.utf8_swapcase(self._pa_array)) - def _str_removeprefix(self, prefix: str): - if prefix == "": - return self - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return self._from_pyarrow_array(result) +def _str_removeprefix(self, prefix: str): + if prefix == "": + return self + + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else( + starts_with, + removed, + self._pa_array, + ) + return self._from_pyarrow_array(result) + + +def _str_removesuffix(self, suffix: str): + if suffix == "": + return self + + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits( + self._pa_array, + 0, + stop=-len(suffix), + ) + result = pc.if_else( + ends_with, + removed, + self._pa_array, + ) + return self._from_pyarrow_array(result) - def _str_removesuffix(self, suffix: str): - if suffix == "": - return self - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, 
removed, self._pa_array) - return self._from_pyarrow_array(result) def _str_startswith( From 2ec810d5bdd0bd7fb3b19909f2b07f0071ec981c Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Wed, 17 Dec 2025 20:58:28 +0530 Subject: [PATCH 05/14] BUG: fix removeprefix/removesuffix for empty strings in pyarrow backend --- pandas/tests/strings/test_strings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 64138667e8f18..30cf40da65859 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -574,6 +574,7 @@ def test_removesuffix(any_string_dtype, suffix, expected): + def test_string_slice_get_syntax(any_string_dtype): ser = Series( ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"], From 9a526eaceaca29cc943377879a10e2e7ff2ca5e9 Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Wed, 17 Dec 2025 21:23:14 +0530 Subject: [PATCH 06/14] Fix for failing cases --- pandas/core/arrays/_arrow_string_mixins.py | 26 ++++------------------ 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 7462a0f5b22c0..b6947f6335e95 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -201,40 +201,22 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return self._from_pyarrow_array(pc.utf8_swapcase(self._pa_array)) - def _str_removeprefix(self, prefix: str): - if prefix == "": - return self - starts_with = pc.starts_with(self._pa_array, pattern=prefix) removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else( - starts_with, - removed, - self._pa_array, - ) + result = pc.if_else(starts_with, removed, self._pa_array) return self._from_pyarrow_array(result) def _str_removesuffix(self, suffix: str): - if suffix == "": - return self - ends_with = 
pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits( - self._pa_array, - 0, - stop=-len(suffix), - ) - result = pc.if_else( - ends_with, - removed, - self._pa_array, - ) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) return self._from_pyarrow_array(result) + def _str_startswith( self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ): From e733bfb3a90743dcfe7c1da77d784d1b4a41fe78 Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Wed, 17 Dec 2025 21:25:56 +0530 Subject: [PATCH 07/14] fix for failing tests --- pandas/core/arrays/_arrow_string_mixins.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index b6947f6335e95..8986fa8f3306e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -202,6 +202,8 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return self._from_pyarrow_array(pc.utf8_swapcase(self._pa_array)) def _str_removeprefix(self, prefix: str): + if prefix == "": + return self starts_with = pc.starts_with(self._pa_array, pattern=prefix) removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) result = pc.if_else(starts_with, removed, self._pa_array) @@ -209,6 +211,8 @@ def _str_removeprefix(self, prefix: str): def _str_removesuffix(self, suffix: str): + if suffix == "": + return self ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) @@ -217,6 +221,7 @@ def _str_removesuffix(self, suffix: str): + def _str_startswith( self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ): From f8d2c6f7af61aebf79e1ad1af119f161d60ae924 Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Thu, 18 Dec 2025 09:50:03 +0530 
Subject: [PATCH 08/14] style: fix indentation --- pandas/core/arrays/_arrow_string_mixins.py | 33 +++++++++++----------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 8986fa8f3306e..1b96a01db40b4 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -201,22 +201,23 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return self._from_pyarrow_array(pc.utf8_swapcase(self._pa_array)) -def _str_removeprefix(self, prefix: str): - if prefix == "": - return self - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return self._from_pyarrow_array(result) - - -def _str_removesuffix(self, suffix: str): - if suffix == "": - return self - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return self._from_pyarrow_array(result) + + def _str_removeprefix(self, prefix: str): + if prefix == "": + return self + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return self._from_pyarrow_array(result) + + + def _str_removesuffix(self, suffix: str): + if suffix == "": + return self + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return self._from_pyarrow_array(result) From 2977a045c50560d0c673a3e4f8f37a4813fe782c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Dec 2025 09:10:50 +0100 Subject: [PATCH 09/14] Apply suggestion from 
@jorisvandenbossche --- pandas/core/arrays/_arrow_string_mixins.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1b96a01db40b4..cc9089fc825bf 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -210,7 +210,6 @@ def _str_removeprefix(self, prefix: str): result = pc.if_else(starts_with, removed, self._pa_array) return self._from_pyarrow_array(result) - def _str_removesuffix(self, suffix: str): if suffix == "": return self From 142a6098078b26a16198843ee048cbe9e2e40f2c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Dec 2025 09:10:57 +0100 Subject: [PATCH 10/14] Apply suggestion from @jorisvandenbossche --- pandas/core/arrays/_arrow_string_mixins.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index cc9089fc825bf..2610112052604 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -218,10 +218,6 @@ def _str_removesuffix(self, suffix: str): result = pc.if_else(ends_with, removed, self._pa_array) return self._from_pyarrow_array(result) - - - - def _str_startswith( self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ): From 78af6fa3de1058c29c737f7f878a679f285e6544 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Dec 2025 09:11:05 +0100 Subject: [PATCH 11/14] Apply suggestion from @jorisvandenbossche --- pandas/tests/strings/test_strings.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 30cf40da65859..5873800794d49 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -573,8 +573,6 @@ def test_removesuffix(any_string_dtype, suffix, expected): tm.assert_series_equal(result, ser_expected) - 
- def test_string_slice_get_syntax(any_string_dtype): ser = Series( ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"], From 1cdc3dff5becc0465b8368976199b7d6cc8fb675 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Dec 2025 09:36:35 +0100 Subject: [PATCH 12/14] fixup style --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 2610112052604..bad7a0a9869d0 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -201,7 +201,7 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return self._from_pyarrow_array(pc.utf8_swapcase(self._pa_array)) - + def _str_removeprefix(self, prefix: str): if prefix == "": return self From 2f302d4184b4f30cdea7063de9a25d982295a9a7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Dec 2025 09:44:05 +0100 Subject: [PATCH 13/14] return shallow copy --- pandas/core/arrays/_arrow_string_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index bad7a0a9869d0..946edb1a8422a 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -204,7 +204,7 @@ def _str_swapcase(self) -> Self: def _str_removeprefix(self, prefix: str): if prefix == "": - return self + return self.copy() starts_with = pc.starts_with(self._pa_array, pattern=prefix) removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) result = pc.if_else(starts_with, removed, self._pa_array) @@ -212,7 +212,7 @@ def _str_removeprefix(self, prefix: str): def _str_removesuffix(self, suffix: str): if suffix == "": - return self + return self.copy() ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = 
pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array) From 4def7face93913754d2b58e6ee1cd8eb67a8d399 Mon Sep 17 00:00:00 2001 From: Vikram Kumar Date: Thu, 18 Dec 2025 16:49:07 +0530 Subject: [PATCH 14/14] Fix type check error by avoiding copy() in ArrowStringArrayMixin --- pandas/core/arrays/_arrow_string_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 946edb1a8422a..4e32db15b392f 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -204,7 +204,7 @@ def _str_swapcase(self) -> Self: def _str_removeprefix(self, prefix: str): if prefix == "": - return self.copy() + return self._from_pyarrow_array(self._pa_array) starts_with = pc.starts_with(self._pa_array, pattern=prefix) removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) result = pc.if_else(starts_with, removed, self._pa_array) @@ -212,7 +212,7 @@ def _str_removeprefix(self, prefix: str): def _str_removesuffix(self, suffix: str): if suffix == "": - return self.copy() + return self._from_pyarrow_array(self._pa_array) ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) result = pc.if_else(ends_with, removed, self._pa_array)