fork download
  1. import pandas as pd
  2.  
  3.  
  4. def analyze_zero_variance(df_list):
  5. """
  6. Analyzes columns with zero variance in each DataFrame, calculating statistics across all DataFrames for common columns.
  7.  
  8. Args:
  9. df_list: List of Pandas DataFrames.
  10.  
  11. Returns:
  12. List of columns that were dropped due to zero variance.
  13. """
  14. REPOSITORY_ZEROVAR_THRESHOLD_VALUE = 0.7
  15.  
  16. # Get common columns across all DataFrames
  17. common_columns = set(df_list[0].columns).intersection(
  18. *[set(df.columns) for df in df_list[1:]])
  19. print("Common columns:", common_columns)
  20.  
  21. # Store statistics for common columns
  22. column_stats = {}
  23. for column in common_columns:
  24. variances = [df[column].var() for df in df_list]
  25. column_stats[column] = {'zero_variance_count': sum(
  26. variance < 1e-8 for variance in variances)}
  27.  
  28. # Print analysis for common columns
  29. print("Zero Variance Analysis across DataFrames:")
  30. for column, stats in column_stats.items():
  31. print(
  32. f"Column: {column}, Zero Variance Count: {stats['zero_variance_count']}")
  33.  
  34. # Drop columns with zero variance in more than the threshold of DataFrames
  35. columns_to_drop = [column for column, stats in column_stats.items(
  36. ) if stats['zero_variance_count'] > len(df_list) * REPOSITORY_ZEROVAR_THRESHOLD_VALUE]
  37. if columns_to_drop:
  38. print(
  39. f"Dropping columns with zero variance in more than {REPOSITORY_ZEROVAR_THRESHOLD_VALUE * 100}% of DataFrames: {columns_to_drop}")
  40. for i, df in enumerate(df_list):
  41. df_list[i] = df.drop(columns=columns_to_drop)
  42.  
  43. return columns_to_drop
  44.  
  45.  
  46. # Example usage:
  47. df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [0, 0, 0]})
  48. df2 = pd.DataFrame({'A': [1, 2, 3], 'B': [0, 0, 0]})
  49.  
  50. columns_to_drop = analyze_zero_variance([df1, df2])
  51.  
  52.  
  53. print("Columns to drop:", columns_to_drop)
  54.  
Success #stdin #stdout 0.3s 58476KB
stdin
Standard input is empty
stdout
Common columns: {'A', 'B'}
Zero Variance Analysis across DataFrames:
Column: A, Zero Variance Count: 0
Column: B, Zero Variance Count: 2
Dropping columns with zero variance in more than 70.0% of DataFrames: ['B']
Columns to drop: ['B']