Intellipaat Back

Explore Courses Blog Tutorials Interview Questions
0 votes
2 views
in Data Science by (17.6k points)

First off, thanks in advance if you can help puzzle this out! I'm trying to balance some customer data for my model. My targets are all 1s and 0s, and the 0s are overwhelmingly abundant. So I created a counter that starts marking the 0 rows for deletion once they surpass the number of 1 rows. But at the very end of my code, when I call np.delete to remove those extra rows from my dataset, I keep getting the error shown below.

I don't really know what to try, because I don't even understand what the error is telling me.

import pandas as pd 

import numpy as np 

from sklearn import preprocessing

#%%

#Loading the Raw Data

# NOTE(review): `display` is not imported in the code shown; presumably this
# runs in an IPython/Jupyter session where it is available -- confirm.
raw_csv_data= pd.read_csv('Audiobooks-data_raw.csv')

print(display(raw_csv_data.head(20)))

#%%

# Work on a copy so that raw_csv_data itself is left untouched.
df=raw_csv_data.copy()

print(display(df.head(20)))

#%%

print(df.info())

#%%

#Separate the Targets from the dataset

# Label-based column slice: every column from 'Book length (mins)_overall'
# through 'Last visited minus Purchase date' (.loc slices include both ends).
inputs_all= df.loc[:,'Book length (mins)_overall':'Last visited minus Purchase date']

targets_all= df['Targets']

print(display(inputs_all.head()))

print(display(targets_all.head()))

#%%

#Shuffling the Data to prep for balancing

# Build the row positions 0..n-1 and permute them in place.
shuffled_indices= np.arange(inputs_all.shape[0])

np.random.shuffle(shuffled_indices)

# NOTE(review): shuffled_inputs / shuffled_targets are not read again in the
# code shown; the balancing below works on the unshuffled inputs_all and
# targets_all.
shuffled_inputs= inputs_all.iloc[shuffled_indices]

shuffled_targets= targets_all[shuffled_indices]

#%%

#Balance the Dataset

#There are significantly more 0's than 1's in our target.

#We want a good accurate model

print(inputs_all.shape)

print(targets_all.shape)

#%%

# Sum of the targets; this equals the number of 1 rows provided the targets
# contain only 0s and 1s.
num_one_targets= int(np.sum(targets_all))

# Running count of 0 rows seen so far by the loop below.
zero_targets_counter= 0

# Row positions of the surplus 0 rows, filled in by the loop below.
indices_to_remove= []

print(num_one_targets)

#%%

# Walk the targets in order. Once more than num_one_targets zeros have been
# seen, every further zero row is recorded for removal.
for i in range(targets_all.shape[0]):

    # NOTE(review): targets_all[i] looks the value up by index label; this
    # presumably relies on the default 0..n-1 integer index -- confirm.
    if targets_all[i]==0:

        zero_targets_counter +=1

        if zero_targets_counter> num_one_targets:

            indices_to_remove.append(i)

#%%

# NOTE(review): per the traceback reported with this script, the first call
# below raises "ValueError: Shape of passed values is (4474, 10), indices imply
# (14084, 10)". np.delete builds a shorter array and then passes it back to
# pandas (__array_wrap__), which tries to rebuild a DataFrame using the
# original, full-length axes.
inputs_all_balanced= np.delete(inputs_all, indices_to_remove, axis=0)

targets_all_balanced= np.delete(targets_all, indices_to_remove, axis=0)

Everything works except when I try to group my balanced datasets and delete the excess 0 rows. Here is the error:

ValueError                                Traceback (most recent call last)

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)

   1652 

-> 1653         mgr = BlockManager(blocks, axes)

   1654         mgr._consolidate_inplace()

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in __init__(self, blocks, axes, do_integrity_check)

    113         if do_integrity_check:

--> 114             self._verify_integrity()

    115 

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in _verify_integrity(self)

    310             if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:

--> 311                 construction_error(tot_items, block.shape[1:], self.axes)

    312         if len(self.items) != tot_items:

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in construction_error(tot_items, block_shape, axes, e)

   1690     raise ValueError("Shape of passed values is {0}, indices imply {1}".format(

-> 1691         passed, implied))

   1692 

ValueError: Shape of passed values is (4474, 10), indices imply (14084, 10)

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)

 in 

----> 1 inputs_all_balanced= np.delete(inputs_all, indices_to_remove, axis=0)

      2 targets_all_balanced= np.delete(targets_all, indices_to_remove, axis=0)

~\Anaconda3\lib\site-packages\numpy\lib\function_base.py in delete(arr, obj, axis)

   4419 

   4420     if wrap:

-> 4421         return wrap(new)

   4422     else:

   4423         return new

~\Anaconda3\lib\site-packages\pandas\core\generic.py in __array_wrap__(self, result, context)

   1907     def __array_wrap__(self, result, context=None):

   1908         d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)

-> 1909         return self._constructor(result, **d).__finalize__(self)

   1910 

   1911     # ideally we would define this to avoid the getattr checks, but

~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)

    422             else:

    423                 mgr = init_ndarray(data, index, columns, dtype=dtype,

--> 424                                    copy=copy)

    425 

    426         # For data is list-like, or Iterable (will consume into list)

~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy)

    165         values = maybe_infer_to_datetimelike(values)

    166 

--> 167     return create_block_manager_from_blocks([values], [columns, index])

    168 

    169 

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)

   1658         blocks = [getattr(b, 'values', b) for b in blocks]

   1659         tot_items = sum(b.shape[0] for b in blocks)

-> 1660         construction_error(tot_items, blocks[0].shape[1:], axes, e)

   1661 

   1662 

~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in construction_error(tot_items, block_shape, axes, e)

   1689         raise ValueError("Empty data passed with indices specified.")

   1690     raise ValueError("Shape of passed values is {0}, indices imply {1}".format(

-> 1691         passed, implied))

   1692 

   1693 

ValueError: Shape of passed values is (4474, 10), indices imply (14084, 10)

1 Answer

0 votes
by (41.4k points)

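np.delete is a NumPy function: it returns a new, shorter array (4474 rows here), and pandas then tries to wrap that result back into a DataFrame using the original index of 14084 rows, which is why the shapes in the error do not match. Since inputs_all and targets_all are pandas objects, remove the rows with pandas drop instead, which removes rows by index label (the labels match the row positions here because the data still has its default integer index):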

# Drop the surplus 0 rows by index label; drop() returns a new object and
# leaves inputs_all / targets_all unchanged.
inputs_all_balanced = inputs_all.drop(index=indices_to_remove)

targets_all_balanced = targets_all.drop(index=indices_to_remove)

31k questions

32.8k answers

501 comments

693 users

Browse Categories

...