Issue
I have two dfs. one df has only 1 and 0 (df_one_zero). and another one has the different values df_value_total. These two has thousand rows and columns!
The first column of each df is the id and we dont want to change that at all.
I want to move with sliding window of 5 through the columns. so, each window I want to work with these two: df_one_zero_window, df_value_window.
In each window,the column which 1's started and ended is important.
Then I want to create another df_out with the same shape to the df_one_zero(initially is set to zero), consider that in column col the 1 started and ended in col_end,
put the value in df_out (row, col-1) = df_value_window(row, col-1)-df_value_window(row, col), and other values are zero.(if the 1 started at index 0, or end at the last column then its ok. it does not need to put value for that) Also, if the 1 in df_one_zero_window ended at col_end, then df_out(row, col_end+1) = df_value_window(row,col_end+1)-df_value_window(row,col_end). In the following dfs, I want to create the df_out= df2. The values in df_value_total are very diverse and here I only select some easy number in my df.
## only has zero and 1
df = pd.DataFrame()
df['id'] = ['a', 'b', 'c']
df['0'] = [0, 0, 0]
df['1'] = [1, 0, 1]
df['2'] = [1, 1, 1]
df['3'] = [0, 0, 0]
df['4'] = [0, 0, 0]
df['5'] = [0, 0, 0]
df['6'] = [0, 1, 1]
df['7'] = [0, 0, 1]
df['8'] = [0, 0, 0]
df['9'] = [0, 0, 0]
df['10'] = [0, 0, 0]
df['11'] = [0, 0, 1]
df['12'] = [1, 1, 1]
df['13'] = [1, 0, 0]
df['14'] = [0, 0, 0]
df['15'] = [0, 0, 0]
df['16'] = [0, 1, 1]
df['17'] = [1, 1, 0]
df['18'] = [0, 0, 0]
df['19'] = [0, 0, 0]
## this is that which has different values
df1 = pd.DataFrame()
df1['id'] = ['a', 'b', 'c']
df1['0'] = [4, 0, 9]
df1['1'] = [0, 0, 1]
df1['2'] = [1, 1, 3]
df1['3'] = [6, 2, 0]
df1['4'] = [0, 0, 0]
df1['5'] = [0, 5, 0]
df1['6'] = [0, 1, 2]
df1['7'] = [0, 0, 1]
df1['8'] = [0, 0, 3]
df1['9'] = [0, 0, 0]
df1['10'] = [0, 0, 0]
df1['11'] = [0, 0, 1]
df1['12'] = [1, 1, 1]
df1['13'] = [1, 3, 4]
df1['14'] = [9, 0, 0]
df1['15'] = [0, 0, 0]
df1['16'] = [2, 1, 1]
df1['17'] = [1, 1, 4]
df1['18'] = [0, 5, 0]
df1['19'] = [0, 0, 0]
I tried to do some parts, but I couldn't track where the 1 is finished and also I think it is not optimum!Can you please help me with that?
def generate_df_out(df_one_zero, df_value_total, window_size=5):
for col in range(1, len(df_one_zero.columns), window_size):
df1_window = df_one_zero.iloc[:, col:col + window_size]
df_value_window = df_value_total.iloc[:, col:col + window_size]
for row in range(df1_window.shape[0]):
start_idx = 0
for col in range(window_size):
if df1_window.iloc[row, col] == 1 and start_idx==0:
df_out.iloc[row, col-1] = df_value_window.iloc[row, col] - df_value_window.iloc[row, col-1]
start_idx += col
return df_out
df_out = generate_df_out(df, df1)
The output I want is like this:
df2 = pd.DataFrame()
df2['id'] = ['a', 'b', 'c']
df2['0'] = [4, 0, 8]
df2['1'] = [0, -1, 0]
df2['2'] = [0, 1, 0]
df2['3'] = [5, 1, -1]
df2['4'] = [0, 0, 0]
df2['5'] = [0, 4, -1]
df2['6'] = [0, 0, 0]
df2['7'] = [0, -1, 0]
df2['8'] = [0, 0, 2]
df2['9'] = [0, 0, 0]
df2['10'] = [0, 0, -1]
df2['11'] = [-1, -1, 0]
df2['12'] = [0, 0, 0]
df2['13'] = [0, 2, 3]
df2['14'] = [9, 0, 0]
df2['15'] = [0, -1, -1]
df2['16'] = [1, 0, 0]
df2['17'] = [0, 0, 3]
df2['18'] = [-1, 4, 0]
df2['19'] = [0, 0, 0]
df2
id 0 1 2 3 4 5 6 7 8 ... 10 11 12 13 14 15 16 17 18 19
0 a 4 0 0 5 0 0 0 0 0 ... 0 -1 1 1 9 0 1 0 -1 0
1 b 0 -1 1 1 0 4 0 -1 0 ... 0 -1 0 2 0 -1 0 0 4 0
2 c 8 0 0 -1 0 -1 0 0 2 ... -1 0 0 3 0 -1 0 3 0 0
Solution
To achieve this, you need a function that processes each window and updates df_out
accordingly. The function should iterate over each window, track the start and end of sequences of 1s in df_one_zero
, and compute the differences in df_value_total
based on these indices. Following is the updated function:
def generate_df_out(df_one_zero, df_value_total, window_size=5):
df_out = pd.DataFrame(0, index=df_one_zero.index, columns=df_one_zero.columns)
df_out['id'] = df_one_zero['id']
for col in range(1, len(df_one_zero.columns), window_size):
for row in range(df_one_zero.shape[0]):
sequence_started = False
for c in range(col, min(col + window_size, len(df_one_zero.columns))):
if df_one_zero.iloc[row, c] == 1:
if not sequence_started:
if c > 1: # Check for the start of the sequence
df_out.iloc[row, c - 1] = df_value_total.iloc[row, c - 1] - df_value_total.iloc[row, c]
sequence_started = True
if c + 1 < len(df_one_zero.columns) and df_one_zero.iloc[row, c + 1] == 0:
df_out.iloc[row, c + 1] = df_value_total.iloc[row, c + 1] - df_value_total.iloc[row, c]
return df_out
Sample Run:
import pandas as pd
## only has zero and 1
df = pd.DataFrame()
df['id'] = ['a', 'b', 'c']
df['0'] = [0, 0, 0]
df['1'] = [1, 0, 1]
df['2'] = [1, 1, 1]
df['3'] = [0, 0, 0]
df['4'] = [0, 0, 0]
df['5'] = [0, 0, 0]
df['6'] = [0, 1, 1]
df['7'] = [0, 0, 1]
df['8'] = [0, 0, 0]
df['9'] = [0, 0, 0]
df['10'] = [0, 0, 0]
df['11'] = [0, 0, 1]
df['12'] = [1, 1, 1]
df['13'] = [1, 0, 0]
df['14'] = [0, 0, 0]
df['15'] = [0, 0, 0]
df['16'] = [0, 1, 1]
df['17'] = [1, 1, 0]
df['18'] = [0, 0, 0]
df['19'] = [0, 0, 0]
## this is that which has different values
df1 = pd.DataFrame()
df1['id'] = ['a', 'b', 'c']
df1['0'] = [4, 0, 9]
df1['1'] = [0, 0, 1]
df1['2'] = [1, 1, 3]
df1['3'] = [6, 2, 0]
df1['4'] = [0, 0, 0]
df1['5'] = [0, 5, 0]
df1['6'] = [0, 1, 2]
df1['7'] = [0, 0, 1]
df1['8'] = [0, 0, 3]
df1['9'] = [0, 0, 0]
df1['10'] = [0, 0, 0]
df1['11'] = [0, 0, 1]
df1['12'] = [1, 1, 1]
df1['13'] = [1, 3, 4]
df1['14'] = [9, 0, 0]
df1['15'] = [0, 0, 0]
df1['16'] = [2, 1, 1]
df1['17'] = [1, 1, 4]
df1['18'] = [0, 5, 0]
df1['19'] = [0, 0, 0]
def generate_df_out(df_one_zero, df_value_total, window_size=5):
df_out = pd.DataFrame(0, index=df_one_zero.index, columns=df_one_zero.columns)
df_out['id'] = df_one_zero['id']
for col in range(1, len(df_one_zero.columns), window_size):
for row in range(df_one_zero.shape[0]):
sequence_started = False
for c in range(col, min(col + window_size, len(df_one_zero.columns))):
if df_one_zero.iloc[row, c] == 1:
if not sequence_started:
if c > 1: # Check for the start of the sequence
df_out.iloc[row, c - 1] = df_value_total.iloc[row, c - 1] - df_value_total.iloc[row, c]
sequence_started = True
if c + 1 < len(df_one_zero.columns) and df_one_zero.iloc[row, c + 1] == 0:
df_out.iloc[row, c + 1] = df_value_total.iloc[row, c + 1] - df_value_total.iloc[row, c]
return df_out
# Example usage
df_out = generate_df_out(df, df1)
print(df_out)
Output:
id 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
0 a 4 0 0 5 0 0 0 0 0 0 0 -1 0 0 8 0 1 0 -1 0
1 b 0 -1 0 1 0 4 0 -1 0 0 0 -1 0 2 0 -1 0 0 4 0
2 c 8 0 0 -3 0 -2 0 0 2 0 -1 0 0 3 0 -1 0 3 0 0
Answered By - Bilesh Ganguly
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.