step1 complete, step2 started

Patrick
2024-02-27 19:16:32 +01:00
parent 3b407bd956
commit 0632b1a7e1
5 changed files with 204 additions and 97 deletions


@@ -8,11 +8,11 @@ def check_export_file(path):
     result = chardet.detect(file.read())
     detected_encoding = result['encoding']
-    print(detected_encoding)
+    # print(detected_encoding)
     # Try: read the file into pandas
     try:
-        return pd.read_csv(path, encoding=detected_encoding)
+        return pd.read_csv(path, encoding=detected_encoding, sep=';')
     # Catch: remove the extra commas
     except pd.errors.ParserError as e:
         # if a parser error occurs, print an error message
@@ -26,7 +26,7 @@ def check_export_file(path):
print(f"Alle Kommas entfernt, einlesen wir erneut versucht ...")
# Nach Komma Ersetzung erneut versuchen
try:
return pd.read_csv(path, encoding=detected_encoding)
return pd.read_csv(path, encoding=detected_encoding, sep=';')
except pd.errors.ParserError as e:
print(f"Erneut Fehler in CSV-Datei: {e}")
print(f"Datei muss manuell geändert werden.")
@@ -35,16 +35,21 @@ def check_export_file(path):
 # for reading in the existing system data
 def create_dataframe_system(path):
     try:
-        return pd.read_csv(path, encoding='utf')
+        return pd.read_csv(path, encoding='utf', sep=';')
     except pd.errors.ParserError as e:
         print(f"Error while reading the CSV")

 # for extracting test and function users
-def extract_testusers(dataframe, keywords):
-    testdata_df = pd.DataFrame(columns=dataframe.columns)
-    for index, row in dataframe.iterrows():
-        if any(row.str.contains(keywords)):
-            # extract = dataframe.loc[row.str.contains(keywords).name]
-            print(row.str.contains(keywords))
-    return testdata_df
+def extract_testusers(df, keywords):
+    # collect every row whose cells contain one of the keywords
+    data = pd.DataFrame()
+    for keyword in keywords:
+        data = pd.concat([data, df[df.apply(contains_search_term, axis=1, key=keyword)]])
+    return data
+
+def contains_search_term(row, key):
+    # True as soon as any cell of the row contains the search term
+    for value in row:
+        if key in str(value):
+            return True
+    return False
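
As a quick sanity check on the reworked helpers, a minimal usage sketch (the file name and keyword list are invented for illustration; only check_export_file, extract_testusers and contains_search_term come from the code above):

df = check_export_file('export.csv')  # placeholder path
testusers = extract_testusers(df, ['test', 'funktion'])  # placeholder keywords
print(len(testusers))

Note that a row matching more than one keyword is concatenated once per match; a trailing testusers.drop_duplicates() would remove such repeats if needed.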

src/step2.py (new file, 29 lines added)

@@ -0,0 +1,29 @@
import pandas as pd
from Levenshtein import distance

def print_status(new, sys, bool_class):
    print(f"\nEntries in the import list: {len(new)}")
    print(f"Entries in the system list: {len(sys)}")
    if bool_class:
        merged_df = pd.merge(new, sys, on=['name', 'vorname'], how='outer', indicator=True)
        matches = pd.merge(new, sys, on=['name', 'vorname'])
        matches = matches[['name', 'vorname', 'klasse']]
    else:
        merged_df = pd.merge(new, sys, on=['name', 'vorname'], how='outer', indicator=True)
        matches = pd.merge(new, sys, on=['name', 'vorname'])
        matches = matches[['name', 'vorname']]
    # build subsets of the rows that exist in only one of the two DataFrames
    only_new = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
    only_sys = merged_df[merged_df['_merge'] == 'right_only'].drop(columns=['_merge'])
    print("\nNumber of matches:", len(matches))
    print("Number of new users:", len(only_new))
    print("Number of outdated users:", len(only_sys))
    print(matches)

def search_typos(new, sys):
    matches = pd.merge(new, sys, on=['name', 'vorname'])
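
For reference, a minimal sketch of how print_status could be exercised, assuming the column names name, vorname and klasse from the merges above. The sample rows are invented, and only the import list carries a klasse column (if both lists had one, pandas would suffix the duplicate and the matches[['name', 'vorname', 'klasse']] selection would fail):

new = pd.DataFrame({'name': ['Muster', 'Beispiel'],
                    'vorname': ['Max', 'Erika'],
                    'klasse': ['5a', '6b']})
sys = pd.DataFrame({'name': ['Muster', 'Alt'],
                    'vorname': ['Max', 'Anna']})
print_status(new, sys, bool_class=True)
# expected counts: 1 match (Muster/Max), 1 new user, 1 outdated user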