fixed repeat function #33

Open · wants to merge 1 commit into base: main

src/utils/repeats.py: 80 changes (43 additions & 37 deletions)
@@ -53,58 +53,64 @@
"Jacinto C. Nascimento",
"Diogo Araújo"]


import os
import logging
import pandas as pd
import warnings
from urllib3.exceptions import NotOpenSSLWarning
import argparse

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Suppress warnings
warnings.filterwarnings("ignore", category=NotOpenSSLWarning)

# Mapping file name
mapping_fn = "mamo_patients_mapping_data.csv"

# Define paths
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
mapping_csv = os.path.join(root_dir, "data-images-breast", "data", "mapping", mapping_fn)

# Debugging output for paths
logging.info(f"Mapping CSV: {mapping_csv}")
def find_repeats(csv_file, columns):
"""Find and print rows in the CSV where the specified columns have repeated values."""
logging.info(f"Loading data from {csv_file}")
try:
df = pd.read_csv(csv_file)

for column in columns:
logging.info(f"Counting occurrences of values in column: {column}")
value_counts = df[column].value_counts()

logging.info(f"Filtering repeated values in column: {column}")
repeated_values = value_counts[value_counts > 1].index

if repeated_values.empty:
logging.info(f"No repeated values found in column: {column}")
else:
logging.info(f"Printing rows with repeated values in column: {column}")
repeated_rows = df[df[column].isin(repeated_values)]
print(f"\nRepeated rows in column '{column}':\n")
print(repeated_rows)

except Exception as e:
logging.error(f"An error occurred: {e}")

def main(csv_filename, columns_to_check):
logging.info("Starting repeat detection...")

# Define paths
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
mapping_csv = os.path.join(root_dir, "data-images-breast", "data", "mapping", csv_filename)

# Define the column names you want to check for repeats
columns_to_check = ['anonymized_patient_id', 'real_patient_id'] # List of columns to check
# Debugging output for paths
logging.info(f"Mapping CSV: {mapping_csv}")

def find_repeats(csv_file, columns):
"""Find and print rows in the CSV where the specified columns have repeated values."""
logging.info(f"Loading data from {csv_file}")
try:
df = pd.read_csv(csv_file)
find_repeats(mapping_csv, columns_to_check)

for column in columns:
logging.info(f"Counting occurrences of values in column: {column}")
value_counts = df[column].value_counts()

logging.info(f"Filtering repeated values in column: {column}")
repeated_values = value_counts[value_counts > 1].index

if repeated_values.empty:
logging.info(f"No repeated values found in column: {column}")
else:
logging.info(f"Printing rows with repeated values in column: {column}")
repeated_rows = df[df[column].isin(repeated_values)]
print(f"\nRepeated rows in column '{column}':\n")
print(repeated_rows)

except Exception as e:
logging.error(f"An error occurred: {e}")
logging.info("Repeat detection complete!")

if __name__ == '__main__':
logging.info("Starting repeat detection...")
find_repeats(mapping_csv, columns_to_check)
logging.info("Repeat detection complete!")
parser = argparse.ArgumentParser(description="Detect repeated values in specified columns of a CSV file.")
parser.add_argument("csv_filename", help="Name of the CSV file to analyze")
parser.add_argument("columns", nargs='+', help="Columns to check for repeated values")
args = parser.parse_args()

main(args.csv_filename, args.columns)

# End of file
# End of file
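
For anyone who wants to try the changed interface, here is a minimal usage sketch (not part of the PR). It assumes the script is importable as repeats from src/utils and that the mapping CSV still lives under data-images-breast/data/mapping; the file and column names are simply the old hard-coded defaults, now passed explicitly.

# Usage sketch (assumption: run from src/utils so that repeats.py is importable).
from repeats import find_repeats, main

# CLI equivalent of the old hard-coded behaviour:
#   python repeats.py mamo_patients_mapping_data.csv anonymized_patient_id real_patient_id
main("mamo_patients_mapping_data.csv", ["anonymized_patient_id", "real_patient_id"])

# find_repeats() can also be pointed at any CSV path directly, bypassing main()'s path logic.
find_repeats("example.csv", ["anonymized_patient_id"])  # "example.csv" is a hypothetical file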