Suggestion: add feature to show in detail changes in 1 df over time · Issue #30429 · pandas-dev/pandas (original) (raw)
Code Sample, a copy-pastable example if possible
import pandas as pd import numpy as np from tabulate import tabulate
def print_table(df, doc=False): """ Print out a nice looking table.
Set doc to True if you are printing this out for a Microsoft Word (11 x 8) document.
When doc is False:
Input:
A B
0 1 0
1 2 0
2 3 0
Output:
+----+-----+-----+
| | A | B |
|----+-----+-----|
| 0 | 1 | 0 |
| 1 | 2 | 0 |
| 2 | 3 | 0 |
+----+-----+-----+
When doc is True:
Input:
a b c d e f g h i j k l m n p
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13
14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14
Output:
Part 1/3
+----+-----+-----+-----+-----+-----+-----+
| | a | b | c | d | e | f |
|----+-----+-----+-----+-----+-----+-----|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 4 | 4 | 4 | 4 | 4 | 4 | 4 |
+----+-----+-----+-----+-----+-----+-----+
Part 2/3
+----+-----+-----+-----+-----+-----+-----+
| | g | h | i | j | k | l |
|----+-----+-----+-----+-----+-----+-----|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| 3 | 3 | 3 | 3 | 3 | 3 | 3 |
| 4 | 4 | 4 | 4 | 4 | 4 | 4 |
+----+-----+-----+-----+-----+-----+-----+
Part 3/3
+----+-----+-----+-----+
| | m | n | p |
|----+-----+-----+-----|
| 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 |
| 2 | 2 | 2 | 2 |
| 3 | 3 | 3 | 3 |
| 4 | 4 | 4 | 4 |
+----+-----+-----+-----+
:param df: DataFrame
:param doc: bool
:return: None
"""
def _simple_print(df):
"""
Single line helper function to print a dataframe.
:param df: DataFrame
:return: None
"""
print(tabulate(df, headers='keys', tablefmt='psql'))
# Just print with no extra slicing.
if not doc:
_simple_print(df)
# Slice the dataframe to show to Word.
else:
# Set the number of columns Word can handle. It typically can handle 6 columns.
max_columns = 6
# Find how many tables we will need to print if we cut the columns by 6ths.
max_tables = int(np.ceil(df.shape[1] / max_columns))
# For each 6th, print the table.
for iteration in range(1, max_tables + 1):
# Print which iteration we're on.
print(f'Part {iteration}/{max_tables}')
# Slice the larger dataframe to obtain the current dataframe.
current_df = df.iloc[0:5, (iteration * max_columns) - max_columns:iteration * max_columns]
# Print current dataframe.
_simple_print(current_df)
def test(old_df, new_df, desired_columns, index=None, verbose=False, print_full=True): """ Description ----------- Check for differences (across time, business rule changes, etc.) over the same dataframe.
Recommended to start without verbose, then add it if needed.
You can toggle whether to print out the sample dataframes (head and tail),
or not.
Sample Usage
------------
old_data = {'ID': ['CHA','COC','COF'],
'Name': ['Chai Tea', 'Cocoa', 'Coffee'],
'Description': ['Chai Kcup','Hot Chocolate Kcup','Coffee Kcup'],
'Cost': [2,2,3]}
new_data = {'ID': ['CHA','COC','COF'],
'Name': ['Chai Tea', 'Cocoa', 'Coffee'],
'Description': ['Chai','Hot Chocolate Kcup','Coffee Kcup'],
'Cost': [2,3,3]}
old_df = (pd.DataFrame(old_data)
.set_index('ID'))
new_df = (pd.DataFrame(new_data)
.set_index('ID'))
test(old_df=old_df, new_df=new_df, desired_columns=old_df.columns)
Output In Console
-----------------
Match? False
Quick Head Test
Old Records:
+------+----------+--------------------+--------+
| ID | Name | Description | Cost |
|------+----------+--------------------+--------|
| CHA | Chai Tea | Chai Kcup | 2 |
| COC | Cocoa | Hot Chocolate Kcup | 2 |
| COF | Coffee | Coffee Kcup | 3 |
+------+----------+--------------------+--------+
New Records:
+------+----------+--------------------+--------+
| ID | Name | Description | Cost |
|------+----------+--------------------+--------|
| CHA | Chai Tea | Chai | 2 |
| COC | Cocoa | Hot Chocolate Kcup | 3 |
| COF | Coffee | Coffee Kcup | 3 |
+------+----------+--------------------+--------+
Quick Tail Test
Old Records:
+------+----------+--------------------+--------+
| ID | Name | Description | Cost |
|------+----------+--------------------+--------|
| CHA | Chai Tea | Chai Kcup | 2 |
| COC | Cocoa | Hot Chocolate Kcup | 2 |
| COF | Coffee | Coffee Kcup | 3 |
+------+----------+--------------------+--------+
New Records:
+------+----------+--------------------+--------+
| ID | Name | Description | Cost |
|------+----------+--------------------+--------|
| CHA | Chai Tea | Chai | 2 |
| COC | Cocoa | Hot Chocolate Kcup | 3 |
| COF | Coffee | Coffee Kcup | 3 |
+------+----------+--------------------+--------+
Quick Cell to Cell Differences
All differences: (2, 2)
Old New
ID
CHA Description Chai Kcup Chai
COC Cost 2 3
Parameters
----------
:param old_df: DataFrame
This could be cases you know to be correct, or cases from last week, etc.
:param new_df: DataFrame
This could be the current output created by code you wish to debug or double
check, or cases from today, etc.
:param desired_columns: list of strings
<old_df.columns>
:param index: string
name of column to match cases against (currently not implemented)
<Student ID>
:return: None
"""
new_df = new_df[desired_columns]
old_df = old_df[desired_columns]
# overall test (returns true or false)
print(f'Match? {old_df.equals(new_df)}')
print('\n')
if print_full:
# quick head and tail test
print('Quick Head Test')
print('Old Records:')
print_table(old_df.head(5))
print('New Records:')
print_table(new_df.head(5))
print('\n')
print('Quick Tail Test')
print('Old Records:')
print_table(old_df.tail(5))
print('New Records:')
print_table(new_df.head(5))
print('\n')
def _diff_table(old_df, new_df):
"""
Create the table that only shows differences (or "errors").
This is code that manesioz on stackoverflow wrote as a response to my question
for how to achieve this.
:param old_df: pandas DataFrame
:param new_df: pandas DataFrame
:return: pandas DataFrame
"""
# create frame of comparison bools
bool_df = (old_df != new_df).stack()
diff_table = pd.concat([old_df.stack()[bool_df],
new_df.stack()[bool_df]],
axis=1)
diff_table.columns = ["Old", "New"]
return diff_table
diff_table = _diff_table(old_df,new_df)
if verbose==False:
# specific cell by cell test for differences
print('Quick Cell to Cell Differences')
print(f'All differences: {diff_table.shape}')
print(diff_table.head())
print('\n')
# Offer a verbose option. Because the following code can produce errors if the
# dataframes are too different (eg. if the column names differ), we offer the
# non-verbose option.
else:
print('Cell to Cell Differences: All')
print(f'All differences: {diff_table.shape}')
print(diff_table)
print('\n')
Problem description
Often I need to do a quick analysis of the same dataframe over some time. I need to look at the specific cells that changed, and how. Frequently, it's at a volume where it's impossible to just eyeball this. I'm surprised that under pandas' testing suite there's not currently an in-built method to do this.
I ended up using manesioz's code from my stackoverflow question here in my own utility function which is included here. But, I think a feature like this (specifically the output under Quick Cell to Cell Differences) could be really useful for other data analysts too.
More sample output of other times I've used this feature:
- Ad hoc, timed request to find any differences in ordering exams over a time period of a month from schools
>>>last_week_df.columns
Index(['BCO', 'Superintendent', 'DBN', 'OrderedExams', 'FLOrderedExams',
'Algebra I', 'ELA', 'Chemistry', 'Earth Science', 'Global Transition',
'Global II', 'Algebra II', 'Living Environment', 'US History',
'Physics', 'Geometry', 'All Exams'],
dtype='object')
>>>last_week_df.shape
(616, 17)
Cell to Cell Differences: All
All differences: (66, 2)
Old New
36 FLOrderedExams Yes No
40 FLOrderedExams Yes No
50 FLOrderedExams Yes No
66 FLOrderedExams Yes No
ELA 60 10
All Exams 110 60
81 OrderedExams No Yes
Algebra I 0 30
Living Environment 0 15
Geometry 0 25
All Exams 0 70
90 FLOrderedExams Yes No
165 OrderedExams Yes No
Algebra I 50 0
ELA 170 0
Earth Science 25 0
Global II 25 0
Algebra II 25 0
Living Environment 25 0
Geometry 15 0
All Exams 335 0
200 Physics 35 34
All Exams 1010 1009
282 Living Environment 250 150
All Exams 750 650
286 ELA 615 440
All Exams 1640 1465
432 Algebra I 695 620
All Exams 1841 1766
540 OrderedExams No Yes
Algebra I 0 130
ELA 0 50
Global Transition 0 30
Living Environment 0 140
US History 0 40
All Exams 0 390
552 OrderedExams No Yes
Algebra I 0 15
ELA 0 25
Chemistry 0 5
Earth Science 0 15
Global Transition 0 20
Algebra II 0 20
Living Environment 0 10
US History 0 10
Geometry 0 10
All Exams 0 130
560 OrderedExams No Yes
Algebra I 0 200
Global Transition 0 100
All Exams 0 300
561 OrderedExams No Yes
Algebra I 0 12
Living Environment 0 25
All Exams 0 37
576 OrderedExams No Yes
FLOrderedExams No Yes
Algebra I 0 30
ELA 0 140
All Exams 0 170
606 OrderedExams No Yes
FLOrderedExams No Yes
Earth Science 0 15
Living Environment 0 15
Geometry 0 25
All Exams 0 55
- Ad hoc, timed request to update a data file dependency for another file on locations of foreign language personnel at sites
MATHI
Match? False
Cell to Cell Differences: All
All differences: (5, 2)
Old New
Exams go to?
14J558 MATHI_K 10 2
MATHI_R 1 2
20J445 MATHI_K 1 3
28W440 MATHI_K 1 4
29W283 MATHI_K 1 4
USH
Match? False
Cell to Cell Differences: All
All differences: (3, 2)
Old New
Exams go to?
02N600 USH_K 1 0
USH_R 0 1
13J499 USH_K 2 1
PHYS
Match? False
Cell to Cell Differences: All
All differences: (8, 2)
Old New
Exams go to?
02N475 PHYS_K 1 0
PHYS_R 0 1
15J519 PHYS_K 4 2
PHYS_R 1 2
28W690 PHYS_K 1 3
30W445 PHYS_K 1 2
31T450 PHYS_K 1 0
PHYS_R 0 1
GLOBALI
Match? False
Cell to Cell Differences: All
All differences: (6, 2)
Old New
Exams go to?
19J660 GLOBALI_K 3 1
GLOBALI_R 1 2
21J540 GLOBALI_K 3 2
GLOBALI_R 1 2
29W272 GLOBALI_K 1 3
GLOBALI_R 2 1
LIVING
Match? True
Cell to Cell Differences: All
All differences: (0, 2)
Empty DataFrame
Columns: [Old, New]
Index: []
GLOBALII
Match? False
Cell to Cell Differences: All
All differences: (7, 2)
Old New
Exams go to?
13J499 GLOBALII_K 6 2
GLOBALII_R 1 2
19J660 GLOBALII_K 4 2
21J540 GLOBALII_K 2 3
28W620 GLOBALII_K 2 4
GLOBALII_R 2 1
29W272 GLOBALII_K 2 4