updated project structure
first_try/compareL.py  Normal file  (140 lines)
@@ -0,0 +1,140 @@
import csv
import uuid

import pandas as pd
from Levenshtein import distance


# TODO Filter for columns; 'klasse' may be needed here
# TODO Filter for duplicates; this will need 'klasse' as well


def read_csv(file_path):
    # Read a semicolon-separated CSV and collect (name, vorname) tuples.
    data = []
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            data.append((row[0].strip(), row[1].strip()))
    return data


def similar_sets(pair, data):
    # Collect entries whose name and first name are each within
    # Levenshtein distance 1 of the given pair.
    # Note: currently unused; main() uses find_similar_pairs (threshold 2).
    similar_pairs = []
    for item in data:
        if distance(pair[0], item[0]) <= 1 and distance(pair[1], item[1]) <= 1:
            similar_pairs.append(item)
    return similar_pairs
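
# Illustrative only (not part of the original commit): with a threshold of 1,
# distance("Meier", "Meyer") == 1 and distance("Schmidt", "Schmitt") == 1
# both count as similar, while "Mueller" vs "Müller" (distance 2) would not.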


def compare_csv(file1, file2):
    data1 = read_csv(file1)
    data2 = read_csv(file2)

    # Set algebra on the (name, vorname) tuples: pairs present in both
    # files, pairs only in file1, and pairs only in file2.
    common_pairs = set(data1) & set(data2)
    unique_pairs1 = set(data1) - common_pairs
    unique_pairs2 = set(data2) - common_pairs

    return common_pairs, unique_pairs1, unique_pairs2, data1, data2
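
# Sketch with hypothetical data: for data1 = [('Doe', 'Jane'), ('Roe', 'Max')]
# and data2 = [('Doe', 'Jane')], compare_csv yields
# common_pairs == {('Doe', 'Jane')}, unique_pairs1 == {('Roe', 'Max')}
# and unique_pairs2 == set().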


def find_similar_pairs(pair, other_data):
    # Same idea as similar_sets, but with a looser threshold of 2.
    similar_pairs = []
    for item in other_data:
        if distance(pair[0], item[0]) <= 2 and distance(pair[1], item[1]) <= 2:
            similar_pairs.append(item)
    return similar_pairs


def create_uuid():
    return str(uuid.uuid4())


def add_hl_tag(row):
    # Strip leading zeros from the class and prefix it with the school tag.
    klasse = str(row['klasse']).lstrip('0')
    return 'HL0707104-' + klasse
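
# For illustration: a row with klasse '07' yields 'HL0707104-7', since
# lstrip('0') removes all leading zeros; '10' stays '10', and a klasse
# consisting only of zeros collapses to the bare prefix 'HL0707104-'.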


def create_import_list(path, path_new, old_pairs, new_pairs, common_pairs):
    system_data = pd.read_csv(path, sep=';', encoding='utf-8')
    # Drop entries that only exist in the old list.
    system_data = system_data[~system_data[['name', 'vorname']].apply(tuple, axis=1).isin(old_pairs)]
    # print(len(system_data))
    # print(system_data)
    new_data = pd.read_csv(path_new, sep=';', encoding='utf-8')

    # For students: old classes are deleted and refilled with the new
    # classes from new_data.

    matches = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(new_pairs)]
    # matches.loc[:, 'klasse'] = matches.apply(add_hl_tag, axis=1)
    # print(len(matches))
    # print(matches)
    system_data = pd.merge(system_data, matches, how='outer',
                           left_on=['name', 'vorname'], right_on=['name', 'vorname'])
    system_data = system_data[['name', 'vorname', 'klasse', 'schuelerid']]
    # Teachers carry no class, so the column is dropped again here.
    system_data = system_data.drop('klasse', axis=1, errors='ignore')
    print(system_data.columns)
    print("Matching entries: " + str(len(system_data)))

    # Entries that only exist in the new list get a fresh UUID.
    new_data = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(common_pairs)]
    # new_data = new_data.drop('Unnamed: 2', axis=1, errors='ignore')
    new_uuids = []
    for _ in range(len(new_data)):
        new_uuids.append(create_uuid())

    new_data.insert(loc=2, column='schuelerid', value=new_uuids)

    mailUserQuota = 2048
    oxUserQuota = 20480
    oxContext = 25

    print(new_data.columns)
    print("New data: " + str(len(new_data)))

    # Fill in the remaining data before the merge.
    # import_df = pd.merge(system_data, new_data, how='outer',
    #                      left_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'],
    #                      right_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'])
    # import_df = pd.merge(system_data, new_data, how='outer',
    #                      left_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota',
    #                               'oxUserQuota', 'oxContext'],
    #                      right_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota',
    #                                'oxUserQuota', 'oxContext'])
    import_df = pd.merge(system_data, new_data, how='outer',
                         left_on=['name', 'vorname', 'schuelerid'],
                         right_on=['name', 'vorname', 'schuelerid'])
    import_df['mailUserQuota'] = mailUserQuota
    import_df['oxUserQuota'] = oxUserQuota
    import_df['oxContext'] = oxContext
    # Teachers have no class; the column stays empty in the export.
    import_df['klasse'] = None
    import_df = import_df[['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', 'oxContext']]
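
    # Sketch of a resulting teacher row (hypothetical values):
    #   Doe;Jane;;3f2b...;2048;20480;25
    # i.e. name;vorname;klasse;schuelerid;mailUserQuota;oxUserQuota;oxContext,
    # with klasse left empty because it is set to None above.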

    # pd.set_option('display.max_rows', None)
    # pd.set_option('display.max_columns', None)
    # print(import_df)
    # pd.reset_option('display.max_rows')
    # pd.reset_option('display.max_columns')
    out_path = '../Data/GPS/outputLehrer.csv'
    import_df.to_csv(out_path, sep=';', index=False)
    print(len(import_df))
    print('Teacher import list created')
    print('Add test users manually!!')


def main():
    file1_path = '../Data/GPS/gpsLnew.csv'
    file2_path = '../Data/GPS/gpsLold2.cvs'
    file3_path = '../Data/GPS/gpsLold.csv'

    common_pairs, new_pairs, old_pairs, data1, data2 = compare_csv(file1_path, file2_path)

    print(f"Number of matching pairs: {len(common_pairs)}")
    print(f"Number of new entries: {len(new_pairs)}")
    print(f"Number of outdated entries: {len(old_pairs)}")

    # Build pairs only from unmatched pairs of the new list.
    for pair in data1:
        similar_pairs_list2 = find_similar_pairs(pair, set(data2) - {pair})

        if similar_pairs_list2:
            print(f"Similar pairs for {pair} from the new list currently in the system: {similar_pairs_list2}")

    create_import_list(file3_path, file1_path, old_pairs, new_pairs, common_pairs)


if __name__ == "__main__":
    main()
first_try/compareS.py  Normal file  (141 lines)
@@ -0,0 +1,141 @@
import csv
import uuid

import pandas as pd
from Levenshtein import distance


# TODO Filter for columns; 'klasse' may be needed here
# TODO Filter for duplicates; this will need 'klasse' as well


def read_csv(file_path):
    # Read a semicolon-separated CSV and collect (name, vorname) tuples.
    data = []
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            data.append((row[0].strip(), row[1].strip()))
    return data


def similar_sets(pair, data):
    # Collect entries within Levenshtein distance 1 in both name parts.
    similar_pairs = []
    for item in data:
        if distance(pair[0], item[0]) <= 1 and distance(pair[1], item[1]) <= 1:
            similar_pairs.append(item)
    return similar_pairs


def compare_csv(file1, file2):
    data1 = read_csv(file1)
    data2 = read_csv(file2)

    common_pairs = set(data1) & set(data2)
    unique_pairs1 = set(data1) - common_pairs
    unique_pairs2 = set(data2) - common_pairs

    return common_pairs, unique_pairs1, unique_pairs2, data1, data2


def find_similar_pairs(pair, other_data):
    # Looser variant of similar_sets with a distance threshold of 2.
    similar_pairs = []
    for item in other_data:
        if distance(pair[0], item[0]) <= 2 and distance(pair[1], item[1]) <= 2:
            similar_pairs.append(item)
    return similar_pairs


def create_uuid():
    return str(uuid.uuid4())


def add_hl_tag(row):
    # Strip leading zeros from the class and prefix it with the school tag.
    klasse = str(row['klasse']).lstrip('0')
    return 'HL0707104-' + klasse


def create_import_list(path, path_new, old_pairs, new_pairs, common_pairs):
    system_data = pd.read_csv(path, sep=';', encoding='utf-8')
    # Drop entries that only exist in the old list.
    system_data = system_data[~system_data[['name', 'vorname']].apply(tuple, axis=1).isin(old_pairs)]
    system_data = system_data.drop('username', axis=1, errors='ignore')
    system_data = system_data.drop('klasse', axis=1, errors='ignore')
    # print(len(system_data))
    # print(system_data)
    new_data = pd.read_csv(path_new, sep=';', encoding='utf-8')

    # For students: old classes are deleted and refilled with the new
    # classes from new_data.

    matches = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(new_pairs)]
    matches.loc[:, 'klasse'] = matches.apply(add_hl_tag, axis=1)
    # print(len(matches))
    # print(matches)
    system_data = pd.merge(system_data, matches, how='outer',
                           left_on=['name', 'vorname'], right_on=['name', 'vorname'])
    system_data = system_data[['name', 'vorname', 'klasse', 'schuelerid']]
    # print(system_data)
    print(len(system_data))

    # Entries that only exist in the new list get a fresh UUID and a class tag.
    new_data = new_data[~new_data[['name', 'vorname']].apply(tuple, axis=1).isin(common_pairs)]
    # new_data = new_data.drop('Unnamed: 2', axis=1, errors='ignore')
    new_uuids = []
    for _ in range(len(new_data)):
        new_uuids.append(create_uuid())
    # Class handling differs between teachers and students.
    # new_data['klasse'] = None
    new_data.loc[:, 'klasse'] = new_data.apply(add_hl_tag, axis=1)
    new_data.insert(loc=2, column='schuelerid', value=new_uuids)

    mailUserQuota = 1024
    oxUserQuota = 5120
    oxContext = 25

    # print(new_data)
    print(len(new_data))

    # Fill in the remaining data before the merge.
    # import_df = pd.merge(system_data, new_data, how='outer',
    #                      left_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'],
    #                      right_on=['name', 'vorname', 'mailUserQuota', 'oxUserQuota', 'oxContext'])
    # import_df = pd.merge(system_data, new_data, how='outer',
    #                      left_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota',
    #                               'oxUserQuota', 'oxContext'],
    #                      right_on=['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota',
    #                                'oxUserQuota', 'oxContext'])
    # Unlike the teacher list, students are merged on 'klasse' as well.
    import_df = pd.merge(system_data, new_data, how='outer',
                         left_on=['name', 'vorname', 'klasse', 'schuelerid'],
                         right_on=['name', 'vorname', 'klasse', 'schuelerid'])
    import_df['mailUserQuota'] = mailUserQuota
    import_df['oxUserQuota'] = oxUserQuota
    import_df['oxContext'] = oxContext
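
    # Sketch of a resulting student row (hypothetical values):
    #   Doe;Jane;HL0707104-7a;3f2b...;1024;5120;25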

    # pd.set_option('display.max_rows', None)
    # pd.set_option('display.max_columns', None)
    # print(import_df)
    # pd.reset_option('display.max_rows')
    # pd.reset_option('display.max_columns')
    out_path = '../Data/GPS/outputSchueler.csv'
    import_df.to_csv(out_path, sep=';', index=False)
    print(len(import_df))
    print('Student import list created')
    print('Add test users manually!!')


def main():
    file1_path = '../Data/GPS/gpsSnew.csv'
    file2_path = '../Data/GPS/gpsSold2.cvs'
    file3_path = '../Data/GPS/gpsSold.csv'

    common_pairs, new_pairs, old_pairs, data1, data2 = compare_csv(file1_path, file2_path)

    print(f"Number of matching pairs: {len(common_pairs)}")
    print(f"Number of new entries: {len(new_pairs)}")
    print(f"Number of outdated entries: {len(old_pairs)}")

    # Build pairs only from unmatched pairs of the new list.
    # for pair in data1:
    #     similar_pairs_list2 = find_similar_pairs(pair, set(data2) - {pair})
    #
    #     if similar_pairs_list2:
    #         print(f"Similar pairs for {pair} from the new list currently in the system: {similar_pairs_list2}")

    create_import_list(file3_path, file1_path, old_pairs, new_pairs, common_pairs)


if __name__ == "__main__":
    main()
first_try/format_csv.py  Normal file  (70 lines)
@@ -0,0 +1,70 @@
import csv

import chardet
import pandas as pd


def check_file(path):
    # Detect the file encoding, then try to parse the CSV with it.
    with open(path, 'rb') as file:
        result = chardet.detect(file.read())

    detected_encoding = result['encoding']
    try:
        pd.read_csv(path, encoding=detected_encoding)
    except pd.errors.ParserError as e:
        # If a parser error occurs, print an error message
        print(f"Error reading the CSV file: {e}")
        print()
        # Strip stray commas that break the parse, then rewrite the file.
        with open(path, 'r', encoding=detected_encoding) as infile:
            data = infile.read().replace(',', '')
        with open(path, 'w', encoding=detected_encoding) as outfile:
            outfile.write(data)
        print("All commas removed")
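

# For reference: chardet.detect returns a dict along the lines of
# {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
# (illustrative values); 'encoding' can also be None if detection fails.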


# Checks the formatting of the CSV, converts it to UTF-8, and saves the
# result as a new list.
def format_csv(path, out_path):
    with open(path, 'rb') as file:
        result = chardet.detect(file.read())

    detected_encoding = result['encoding']

    # Read the CSV file with pandas
    try:
        df = pd.read_csv(path, encoding=detected_encoding)
        print("File read successfully.")
        df.to_csv(out_path, index=False, encoding='utf-8')
        print("UTF-8 copy created successfully.")

    except pd.errors.ParserError as e:
        # If a parser error occurs, print an error message
        print(f"Error reading the CSV file: {e}")


def clean_data(path, clean):
    try:
        # Read the header of the CSV file
        with open(path, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile, delimiter=';')
            header = next(reader)
            # Find the indices of the 'name', 'vorname' and 'klasse' columns
            name_index = header.index('name')
            vorname_index = header.index('vorname')
            klasse_index = header.index('klasse')

            # Open the output CSV in write mode and write back only the
            # desired columns
            with open(clean, 'w', newline='', encoding='utf-8') as csvfile2:
                writer = csv.writer(csvfile2, delimiter=';')

                # Write the new header
                writer.writerow(['name', 'vorname', 'klasse'])
                print(name_index, vorname_index, klasse_index)
                for row in reader:
                    writer.writerow([row[name_index], row[vorname_index], row[klasse_index]])

        print(f'Only the columns "name", "vorname" and "klasse" of the CSV file {path} were kept.')

    except FileNotFoundError:
        print(f'The file {path} was not found.')
    except ValueError:
        print('The columns "name", "vorname" and "klasse" were not found.')
first_try/main.py  Normal file  (26 lines)
@@ -0,0 +1,26 @@
from first_try.format_csv import check_file, clean_data, format_csv


if __name__ == "__main__":
    oldlist = 'Data/alte-liste-utf.csv'
    newlist = 'Data/neue-liste-utf.csv'

    test = 'GPS/lehrer_HL0707113.csv_intern'
    new_test = 'GPS/gpsLold.csv'
    clean = 'GPS/gpsLold2.cvs'

    # Check the list for errors and replace characters
    check_file(test)
    format_csv(test, new_test)
    clean_data(new_test, clean)

    # path_old_csv = input('Enter the path to the old list: ')
    # format_csv(path_old_csv, oldlist)
    # path_new_csv = input('Enter the path to the new list: ')
    # format_csv(path_new_csv, newlist)

    # Open the generated lists with pandas
    # Check column names and delete superfluous columns
    # Start the comparison - collisions
    # TODO Test users + special cases, room accounts etc. "Tafel" "Raum"
    # TODO Detect broken special characters (oxport_schueler.csv - 171)
    # TODO File / output of totals to compare against a dry run
    #      (ImportLines, Created, Modified, Delete)
first_try/output.py  Normal file  (38 lines)
@@ -0,0 +1,38 @@
import csv


# Target format: ${nachname};${vorname};HL070${SCHOOL}-${klasse};${recordID};1024;${ox_quota};${ox_context}
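# A filled-in row might look like (hypothetical values):
#   Doe;Jane;HL0707104-7a;3f2b9c;1024;5120;25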


def create_output(path):
    schule = input('School: ')
    record_id = input('Record ID: ')
    mail_quota = input('Mail Quota: ')
    ox_quota = input('OX Quota: ')
    ox_context = input('OX Context: ')

    data = []
    with open(path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            data.append((row[0].strip(), row[1].strip(), schule, record_id, mail_quota, ox_quota, ox_context))

    csv_file_path = '../Data/output.csv'

    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=';')

        # Write the header row (optional)
        # name;vorname;klasse;schuelerid;mailUserQuota;oxUserQuota;oxContext
        csv_writer.writerow(['name', 'vorname', 'klasse', 'schuelerid', 'mailUserQuota', 'oxUserQuota', 'oxContext'])
        # TODO Check or generate the UUID
        # Write the collected rows to the CSV file
        csv_writer.writerows(data)

    print(f"CSV file created successfully: {csv_file_path}")


if __name__ == "__main__":
    create_output('../Data/test_new.csv')

    # TODO Remove empty lines
    # TODO Reformat class name - insert the HL070**** number