step2 abgeschlossen
This commit is contained in:
39
src/step2.py
39
src/step2.py
@@ -2,28 +2,47 @@ import pandas as pd
|
||||
from Levenshtein import distance
|
||||
|
||||
|
||||
def compare_data(new, sys, bool_class):
    """Compare the import list with the system list by name and first name.

    Prints the size of both lists, determines which entries occur in both
    and which occur in only one of them, prints a summary via
    ``print_status`` and runs ``search_typos`` on the unmatched entries.

    Args:
        new: DataFrame of imported entries with the columns ``name`` and
            ``vorname`` (plus ``klasse`` when ``bool_class`` is true).
        sys: DataFrame of system entries with the columns ``name`` and
            ``vorname`` (plus ``klasse`` when ``bool_class`` is true).
        bool_class: When true, the ``klasse`` column of ``new`` is carried
            into the result and the ``klasse`` column of ``sys`` is dropped.

    Returns:
        A tuple ``(matches, only_new)``: the entries present in both lists
        and the entries present only in ``new``.
    """
    print(f"\nEinträge in Import Liste: {len(new)}")
    print(f"Einträge in System Liste: {len(sys)}")

    keys = ['name', 'vorname']

    if bool_class:
        # A leftover 'index' column would otherwise leak into the matches.
        if 'index' in new.columns:
            new = new.drop(columns=['index'])
        # Drop the system-side 'klasse' so the merge yields a single,
        # unsuffixed 'klasse' column taken from the import list.
        sys = sys.drop(columns=['klasse'])
        result_columns = ['name', 'vorname', 'klasse']
    else:
        result_columns = ['name', 'vorname']
        # matches = matches[['name', 'vorname', 'schuelerid']]

    merged_df = pd.merge(new[result_columns], sys, on=keys, how='outer', indicator=True)
    matches = pd.merge(new, sys, on=keys)[result_columns]

    # Build subsets for rows that are present in only one of the DataFrames.
    only_new = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
    only_sys = merged_df[merged_df['_merge'] == 'right_only'].drop(columns=['_merge'])

    print_status(matches, only_new, only_sys)
    search_typos(only_new[keys], only_sys[keys])

    return matches, only_new
|
||||
|
||||
|
||||
def search_typos(new, sys, max_distance=2):
    """Print name pairs from ``new`` and ``sys`` that differ only slightly.

    Every row of ``new`` is compared with every row of ``sys``. A pair is
    reported as a possible typo when the Levenshtein distance of the
    ``vorname`` values and of the ``name`` values are both at most
    ``max_distance``. The findings are printed; nothing is returned.

    Args:
        new: DataFrame with the columns ``name`` and ``vorname``.
        sys: DataFrame with the columns ``name`` and ``vorname``.
        max_distance: Largest edit distance per column that still counts
            as a possible typo.
    """
    col1 = 'vorname'
    col2 = 'name'

    # Materialise the system pairs once; they are re-scanned for every
    # row of ``new``.
    sys_pairs = list(zip(sys[col1], sys[col2]))

    typos = [
        (new_first, new_last, sys_first, sys_last)
        for new_first, new_last in zip(new[col1], new[col2])
        for sys_first, sys_last in sys_pairs
        if distance(new_first, sys_first) <= max_distance
        and distance(new_last, sys_last) <= max_distance
    ]

    if typos:
        print('Mögliche Tippfehler:', len(typos), '\n', typos)
    else:
        print('Mögliche Tippfehler: keine Fehler gefunden!')
|
||||
|
||||
|
||||
def print_status(matches, new, old):
    """Print the number of matching, new and outdated users.

    Args:
        matches: Sized collection of entries found in both lists.
        new: Sized collection of entries counted as new users.
        old: Sized collection of entries counted as outdated users.
    """
    print("\nAnzahl Übereinstimmungen:", len(matches))
    print("Anzahl neuer Nutzer:", len(new))
    print("Anzahl veralteter Nutzer:", len(old))
|
||||
|
||||
Reference in New Issue
Block a user