Issue
My DataFrame looks like this:
Now i try to use the following command:
car_sales['Price'] = car_sales['Price'].str.replace('[\$\,\.]', '').astype(int)
As well as
car_sales['Price'] = car_sales['Price'].astype(str).str.replace('[\$\,\.]', '').astype(int)
But i get the following error:
ValueError Traceback (most recent call last)
Cell In[87], line 1
----> 1 car_sales['Price'] = car_sales['Price'].astype(str).str.replace('[\$\,\.]', '').astype(int)
2 car_sales
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py:6324, in NDFrame.astype(self, dtype, copy, errors)
6317 results = [
6318 self.iloc[:, i].astype(dtype, copy=copy)
6319 for i in range(len(self.columns))
6320 ]
6322 else:
6323 # else, only a single dtype is given
-> 6324 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6325 return self._constructor(new_data).__finalize__(self, method="astype")
6327 # GH 33113: handle empty frame or series
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\managers.py:451, in BaseBlockManager.astype(self, dtype, copy, errors)
448 elif using_copy_on_write():
449 copy = False
--> 451 return self.apply(
452 "astype",
453 dtype=dtype,
454 copy=copy,
455 errors=errors,
456 using_cow=using_copy_on_write(),
457 )
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\managers.py:352, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
350 applied = b.apply(f, **kwargs)
351 else:
--> 352 applied = getattr(b, f)(**kwargs)
353 result_blocks = extend_blocks(applied, result_blocks)
355 out = type(self).from_blocks(result_blocks, self.axes)
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\blocks.py:511, in Block.astype(self, dtype, copy, errors, using_cow)
491 """
492 Coerce to the new dtype.
493
(...)
507 Block
508 """
509 values = self.values
--> 511 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
513 new_values = maybe_coerce_values(new_values)
515 refs = None
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py:242, in astype_array_safe(values, dtype, copy, errors)
239 dtype = dtype.numpy_dtype
241 try:
--> 242 new_values = astype_array(values, dtype, copy=copy)
243 except (ValueError, TypeError):
244 # e.g. _astype_nansafe can fail on object-dtype of strings
245 # trying to convert to float
246 if errors == "ignore":
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py:187, in astype_array(values, dtype, copy)
184 values = values.astype(dtype, copy=copy)
186 else:
--> 187 values = _astype_nansafe(values, dtype, copy=copy)
189 # in pandas we don't store numpy str dtypes, so convert to object
190 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
File ~\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py:138, in _astype_nansafe(arr, dtype, copy, skipna)
134 raise ValueError(msg)
136 if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
137 # Explicit copy, or required since NumPy can't view from / to object.
--> 138 return arr.astype(dtype, copy=True)
140 return arr.astype(dtype, copy=copy)
ValueError: invalid literal for int() with base 10: '$4,000.00'
Solution
You have to set regex=True
as parameter of str.replace
. However, why do you remove the decimal point? Convert as float
instead of int
or chain both or use pd.to_numeric
with downcast
.
car_sales['P1'] = car_sales['Price'].str.replace('[\$\,]', '', regex=True).astype(float)
car_sales['P2'] = car_sales['Price'].str.replace('[\$\,]', '', regex=True).astype(float).astype(int)
car_sales['P3'] = pd.to_numeric(car_sales['Price'].str.replace('[\$\,]', '', regex=True), downcast='integer')
Output:
>>> car_sales
Price P1 P2 P3
0 $4,000.00 4000.0 4000 4000
1 $5,000.00 5000.0 5000 5000
Obviously, if you also want to remove the decimal point as described by @wjandrea, you can simply modify the regex:
car_sales['P4'] = pd.to_numeric(car_sales['Price'].str.replace('[\$\,\.]', '', regex=True), downcast='integer')
Output:
>>> car_sales
Price P1 P2 P3 P4
0 $4,000.00 4000.0 4000 4000 400000
1 $5,000.00 5000.0 5000 5000 500000
Answered By - Corralien
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.