Back

Explore Courses Blog Tutorials Interview Questions
0 votes
2 views
in Data Science by (18.4k points)
edited by

I am trying to update a data frame with MultiIndex columns (the df below) by masking some of its values, but I cannot find the proper syntax. Is there a way to reindex states_df so that it has two column levels as well? Or is there a simpler way?

# -*- coding: utf-8 -*-

"""

Created on Thu Jul  2 18:31:31 2020

@author: ancollet

"""

import numpy as np

import pandas as pd

def generate_series():
    """Return a Series of five standard-normal draws labelled 1 through 5."""
    labels = [1, 2, 3, 4, 5]
    # Draw a 1 x 5 block from the global NumPy random state and keep its only row.
    draws = np.random.randn(1, len(labels))[0]
    return pd.Series(data=draws, index=labels)

# Labels for the two column levels: the elements, then the assets

iterables = [['U', 'acidity', 'Al'], ['TSU16_PR']]

# Build a two-level column MultiIndex from the product of those labels
# (the first level is named 'elment' as written; that spelling is kept as-is)

columns = pd.MultiIndex.from_product(iterables, names=['elment', 'asset'])

# Create an empty data frame with those MultiIndex columns

df = pd.DataFrame(columns=columns)

# Add data: one random series per (element, asset) column. The last two
# assignments use assets that were not in the initial column index.

df['U', 'TSU16_PR'] = generate_series()

df['acidity', 'TSU16_PR'] = generate_series()

df['Al', 'TSU16_PR'] = generate_series()

df['U', 'TSU17_PR'] = generate_series()

df['U', 'TSU18_PR'] = generate_series()

# State flags, one single-level column per asset, on the row labels 1..5

states_df = pd.DataFrame([[0.0, 1.0, 0.0],

                          [1.0, 1.0, 1.0],

                          [1.0, 0.0, 1.0],

                          [1.0, 1.0, 1.0],

                          [0.0, 1.0, 1.0]],

                         columns=['TSU16_PR', 'TSU17_PR', 'TSU18_PR'],

                         index=[1, 2, 3, 4, 5])

# This is not working: df has two column levels while states_df has only one
# NOTE(review): the in-place where() is applied to the result of .loc, which
# may be a copy rather than df itself -- confirm

df.loc[:, (slice(None),slice(None))].where(states_df != 0, np.nan, inplace=True)

I know I can achieve it the following way, so it might not be a big deal. Here is the desired output:

# Column labels of df written out level by level: elements first, then assets

arrays = [['U', 'acidity', 'Al', 'U', 'U'],

          ['TSU16_PR', 'TSU16_PR', 'TSU16_PR', 'TSU17_PR', 'TSU18_PR']]

# Pair the two levels into one (element, asset) tuple per column

tuples = list(zip(*arrays))

columns = pd.MultiIndex.from_tuples(tuples, names=['elment', 'asset'])

# Same flags as states_df, with each asset's column written once per element
# listed under that asset, so the columns have the same two levels as df

states_df_2 = pd.DataFrame([[0.0, 0.0, 0.0, 1.0, 0.0],

                           [1.0, 1.0, 1.0, 1.0, 1.0],

                           [1.0, 1.0, 1.0, 0.0, 1.0],

                           [1.0, 1.0, 1.0, 1.0, 1.0],

                           [0.0, 0.0, 0.0, 1.0, 1.0]],

                           columns=columns,

                           index=[1, 2, 3, 4, 5])

# Keep df's values where the flag is non-zero and put NaN everywhere else

df.where(states_df_2 != 0, np.nan, inplace = True)

In[1]: df

Out[1]: 

elment         U   acidity        Al         U          

asset   TSU16_PR  TSU16_PR  TSU16_PR  TSU17_PR  TSU18_PR

1            NaN       NaN       NaN  0.188960       NaN

2       1.920012 -1.355612  0.514419 -0.648037  0.461363

3       0.196968 -1.292682 -0.484867       NaN  0.373522

4      -0.340107  0.764010  1.081631 -0.141903  0.530718

5            NaN       NaN       NaN -0.732350 -1.148502

1 Answer

0 votes
by (36.8k points)
edited by

You can use DataFrame.reindex to build the mask and pass it to DataFrame.where:

# Reindex states_df on df's columns, matching on the second level ('asset'),
# so the mask gets the same two-level columns as df; where() then keeps the
# values whose mask entry is True and sets the others to NaN
df = df.where(states_df.reindex(df.columns, level=1, axis=1) != 0)

print (df)

elment         U   acidity        Al         U          

asset   TSU16_PR  TSU16_PR  TSU16_PR  TSU17_PR  TSU18_PR

1            NaN       NaN       NaN -0.434351       NaN

2       0.997345 -2.426679 -0.094709  2.205930  1.490732

3       0.282978 -0.428913  1.491390       NaN -0.935834

4      -1.506295  1.265936 -0.638902  1.004054  1.175829

5            NaN       NaN       NaN  0.386186 -1.253881

Detail:

# Show the boolean mask on its own: states_df's flags under df's two-level columns
print (states_df.reindex(df.columns, level=1, axis=1) != 0)

elment        U  acidity       Al        U         

asset  TSU16_PR TSU16_PR TSU16_PR TSU17_PR TSU18_PR

1         False    False    False     True    False

2          True     True     True     True     True

3          True     True     True    False     True

4          True     True     True     True     True

5         False    False    False     True     True

Do check out the Data Science with Python course, which helps you understand the topic from scratch.

Browse Categories

...