From d387d468c76a59239ee83e95731ec5b6ae7ce87e Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 1 Apr 2026 14:29:05 +0200 Subject: [PATCH] clean up --- .gitignore | 1 + .idea/.gitignore | 8 + gewinnspiel/cleanUpCsv.php | 320 ++++++++++++++++++++++++++++++------- 3 files changed, 271 insertions(+), 58 deletions(-) create mode 100644 .gitignore create mode 100644 .idea/.gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..469ac46 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/encodings.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/gewinnspiel/cleanUpCsv.php b/gewinnspiel/cleanUpCsv.php index 0783aa1..ede89d4 100644 --- a/gewinnspiel/cleanUpCsv.php +++ b/gewinnspiel/cleanUpCsv.php @@ -19,8 +19,12 @@ if (!$handle) { exit(1); } -const COL_STREET = 5; -const COL_EMAIL = 7; +const COL_FIRSTNAME = 3; +const COL_LASTNAME = 4; +const COL_STREET = 5; +const COL_PLZORT = 6; +const COL_EMAIL = 7; +const COL_PHONE = 8; // Delimiter auto-erkennen $firstLine = fgets($handle); @@ -30,12 +34,9 @@ rewind($handle); echo "Erkannter Delimiter: '$delimiter'\n"; -// Header einlesen $header = fgetcsv($handle, 0, $delimiter); $header = array_map(fn($cell) => mb_convert_encoding($cell, 'UTF-8', 'ISO-8859-1'), $header); -$emailMap = []; -$streetMap = []; $rows = []; $rowNumber = 1; @@ -43,19 +44,6 @@ while (($row = fgetcsv($handle, 0, $delimiter)) !== false) { $rowNumber++; $row = array_map(fn($cell) => mb_convert_encoding($cell, 'UTF-8', 'ISO-8859-1'), $row); $rows[$rowNumber] = $row; - - $email = mb_strtolower(trim($row[COL_EMAIL] ?? '')); - $street = mb_strtolower(trim($row[COL_STREET] ?? '')); - - if ($email !== '') { - $emailMap[$email]['original'] = trim($row[COL_EMAIL]); - $emailMap[$email]['rows'][] = $rowNumber; - } - - if ($street !== '') { - $streetMap[$street]['original'] = trim($row[COL_STREET]); - $streetMap[$street]['rows'][] = $rowNumber; - } } fclose($handle); @@ -64,35 +52,127 @@ fclose($handle); // Hilfsfunktionen // ------------------------------------------------------- -function printBlock(array $data, string $label, array $header, array $rows): void +function normalizeUmlauts(string $str): string +{ + return str_replace( + ['ä', 'ö', 'ü', 'ß', 'Ä', 'Ö', 'Ü'], + ['ae', 'oe', 'ue', 'ss', 'ae', 'oe', 'ue'], + $str + ); +} + +function normalizeName(string $name): string +{ + return normalizeUmlauts(mb_strtolower(trim($name))); +} + +function firstToken(string $name): string +{ + return preg_split('/\s+/', trim($name))[0] ?? ''; +} + +function isSimilar(string $a, string $b, int $threshold = 80): bool +{ + similar_text($a, $b, $percent); + return $percent >= $threshold; +} + +function nameSimilar(string $a, string $b): bool +{ + return isSimilar(normalizeName($a), normalizeName($b)); +} + +function firstNameSimilar(string $a, string $b): bool +{ + return isSimilar(normalizeName(firstToken($a)), normalizeName(firstToken($b))); +} + +function extractPlz(string $plzOrt): string +{ + preg_match('/\d{4,5}/', $plzOrt, $matches); + return $matches[0] ?? ''; +} + +function extractOrt(string $plzOrt): string +{ + return mb_strtolower(trim(preg_replace('/^\d{4,5}\s*/', '', trim($plzOrt)))); +} + +function normalizePhone(string $phone): string +{ + return preg_replace('/\D/', '', $phone); +} + +function isDuplicate(array $rowA, array $rowB) +{ + if (!nameSimilar($rowA[COL_LASTNAME], $rowB[COL_LASTNAME])) return false; + if (!firstNameSimilar($rowA[COL_FIRSTNAME], $rowB[COL_FIRSTNAME])) return false; + + $reasons = []; + + $plzA = extractPlz($rowA[COL_PLZORT] ?? ''); + $plzB = extractPlz($rowB[COL_PLZORT] ?? ''); + $ortA = extractOrt($rowA[COL_PLZORT] ?? ''); + $ortB = extractOrt($rowB[COL_PLZORT] ?? ''); + $strA = mb_strtolower(trim($rowA[COL_STREET] ?? '')); + $strB = mb_strtolower(trim($rowB[COL_STREET] ?? '')); + $phoneA = normalizePhone($rowA[COL_PHONE] ?? ''); + $phoneB = normalizePhone($rowB[COL_PHONE] ?? ''); + + if ($plzA !== '' && $plzB !== '' && $plzA === $plzB) { + $reasons[] = "gleiche PLZ ($plzA)"; + } + + if ($ortA !== '' && $ortB !== '' && isSimilar($ortA, $ortB)) { + $reasons[] = "ähnlicher Ort ($ortA ≈ $ortB)"; + } + + if ($strA !== '' && $strB !== '' && isSimilar($strA, $strB)) { + $reasons[] = "ähnliche Straße ($strA ≈ $strB)"; + } + + if ($phoneA !== '' && $phoneB !== '' && $phoneA === $phoneB) { + $reasons[] = "gleiche Telefonnummer ($phoneA)"; + } + + if ($plzA === '' || $plzB === '') { + $reasons[] = "PLZ/Ort fehlt bei einem Eintrag"; + } + + if (empty($reasons)) return false; + + return $reasons; +} + +function printBlock(array $rowNums, array $header, array $rows, array $reasons): void { - $colWidth = 22; + $colWidth = 20; $separator = str_repeat('-', count($header) * ($colWidth + 3)) . "\n"; $headerLine = implode(' | ', array_map(fn($h) => str_pad(mb_substr($h, 0, $colWidth), $colWidth), $header)); echo "\n" . $separator; - echo "$label: {$data['original']}\n"; + echo "Grund: " . implode(', ', $reasons) . "\n"; echo $separator; echo "Zeile | $headerLine\n"; echo $separator; - foreach ($data['rows'] as $rowNum) { + foreach ($rowNums as $rowNum) { $cells = array_map( fn($cell) => str_pad(mb_substr($cell, 0, $colWidth), $colWidth), $rows[$rowNum] ); - echo " $rowNum | " . implode(' | ', $cells) . "\n"; + echo " " . str_pad((string)$rowNum, 5) . "| " . implode(' | ', $cells) . "\n"; } echo $separator; } -function askDelete(array $data, string $label, array $header, array $rows, array &$rowsToDelete): void +function askDelete(array $rowNums, array $header, array $rows, array $reasons, array &$rowsToDelete): void { - printBlock($data, $label, $header, $rows); + printBlock($rowNums, $header, $rows, $reasons); - $keep = $data['rows'][0]; - $delete = array_slice($data['rows'], 1); + $keep = $rowNums[0]; + $delete = array_slice($rowNums, 1); echo "Behalten: Zeile $keep – Löschen: Zeilen " . implode(', ', $delete) . "\n"; echo "[1] Löschen [Enter] Überspringen: "; @@ -108,54 +188,177 @@ function askDelete(array $data, string $label, array $header, array $rows, array } } +function progressBar(int $current, int $total, int $width = 30): string +{ + $pct = $total > 0 ? $current / $total : 1; + $filled = (int)round($pct * $width); + $bar = str_repeat('█', $filled) . str_repeat('░', $width - $filled); + return '[' . $bar . '] ' . str_pad((int)round($pct * 100), 3) . "% ($current/$total)"; +} + +function processDuplicateGroups(array $groups, array $header, array $rows, array &$rowsToDelete, string $stepLabel): void +{ + $total = count($groups); + $current = 0; + + if ($total === 0) { + echo "Keine gefunden.\n"; + return; + } + + echo "$total Gruppe(n) gefunden.\n"; + + foreach ($groups as $group) { + $current++; + $groupRows = array_values(array_filter($group['rows'], fn($r) => !isset($rowsToDelete[$r]))); + if (count($groupRows) < 2) continue; + + echo "\n" . progressBar($current, $total) . " – $stepLabel\n"; + askDelete($groupRows, $header, $rows, $group['reasons'], $rowsToDelete); + } + + echo "\n" . progressBar($total, $total) . " – $stepLabel abgeschlossen.\n"; +} + // ------------------------------------------------------- // SCHRITT 1: E-Mail-Duplikate // ------------------------------------------------------- -$emailDuplicates = array_filter($emailMap, fn($entry) => count($entry['rows']) > 1); -$rowsToDelete = []; - echo "\n========================================\n"; -echo " SCHRITT 1: Doppelte E-Mail-Adressen\n"; +echo " SCHRITT 1: E-Mail-Duplikate\n"; echo "========================================\n"; -if (empty($emailDuplicates)) { - echo "Keine gefunden.\n"; -} else { - echo count($emailDuplicates) . " doppelte E-Mail-Adresse(n) gefunden.\n"; +$emailMap = []; +foreach ($rows as $rowNum => $row) { + $email = mb_strtolower(trim($row[COL_EMAIL] ?? '')); + if ($email !== '') { + $emailMap[$email][] = $rowNum; + } +} + +$autoDeleteGroups = []; +$manualReviewGroups = []; - foreach ($emailDuplicates as $data) { - askDelete($data, 'E-Mail', $header, $rows, $rowsToDelete); +foreach ($emailMap as $email => $rowNums) { + if (count($rowNums) < 2) continue; + + // Prüfen ob alle Einträge gleichen Vor- + Nachnamen haben + $autoDelete = true; + $firstFirst = normalizeName(firstToken($rows[$rowNums[0]][COL_FIRSTNAME] ?? '')); + $firstLast = normalizeName($rows[$rowNums[0]][COL_LASTNAME] ?? ''); + + foreach ($rowNums as $rowNum) { + $first = normalizeName(firstToken($rows[$rowNum][COL_FIRSTNAME] ?? '')); + $last = normalizeName($rows[$rowNum][COL_LASTNAME] ?? ''); + if (!isSimilar($first, $firstFirst) || !isSimilar($last, $firstLast)) { + $autoDelete = false; + break; + } + } + + if ($autoDelete) { + $autoDeleteGroups[] = ['rows' => $rowNums, 'reasons' => ["gleiche E-Mail + gleicher Name ($email)"]]; + } else { + $manualReviewGroups[] = ['rows' => $rowNums, 'reasons' => ["E-Mail: $email"]]; } } +$rowsToDelete = []; +$autoDeleteCount = 0; + +// Auto-Delete: gleiche E-Mail + gleicher Name +if (!empty($autoDeleteGroups)) { + echo "\n--- Auto-Delete: gleiche E-Mail + gleicher Name ---\n"; + foreach ($autoDeleteGroups as $group) { + $keep = $group['rows'][0]; + $delete = array_slice($group['rows'], 1); + + printBlock($group['rows'], $header, $rows, $group['reasons']); + echo "Behalten: Zeile $keep – Auto-gelöscht: Zeilen " . implode(', ', $delete) . "\n"; + + foreach ($delete as $rowNum) { + $rowsToDelete[$rowNum] = true; + $autoDeleteCount++; + } + } + echo "\n========================================\n"; + echo " Auto-Delete Zusammenfassung\n"; + echo "========================================\n"; + echo "Gruppen: " . count($autoDeleteGroups) . "\n"; + echo "Gelöschte Zeilen: $autoDeleteCount\n"; + echo "Zeilen: " . implode(', ', array_keys($rowsToDelete)) . "\n"; +} + +// Manuelle Prüfung: gleiche E-Mail aber unterschiedliche Namen +if (!empty($manualReviewGroups)) { + echo "\n--- Manuelle Prüfung: gleiche E-Mail, unterschiedliche Namen ---\n"; + processDuplicateGroups($manualReviewGroups, $header, $rows, $rowsToDelete, 'E-Mail'); +} else { + echo "\nKeine manuellen E-Mail-Duplikate.\n"; +} + // ------------------------------------------------------- -// SCHRITT 2: Straßen-Duplikate (bereits gelöschte Zeilen ausblenden) +// SCHRITT 2: Mehrfachanmeldungen // ------------------------------------------------------- echo "\n========================================\n"; -echo " SCHRITT 2: Doppelte Straßen\n"; +echo " SCHRITT 2: Mehrfachanmeldungen\n"; echo "========================================\n"; -foreach ($streetMap as $key => $data) { - $streetMap[$key]['rows'] = array_values(array_filter($data['rows'], fn($r) => !isset($rowsToDelete[$r]))); - if (count($streetMap[$key]['rows']) < 2) { - unset($streetMap[$key]); +$byLastname = []; +foreach ($rows as $rowId => $row) { + if (isset($rowsToDelete[$rowId])) continue; + $key = mb_substr(normalizeName($row[COL_LASTNAME] ?? ''), 0, 3); + if ($key !== '') { + $byLastname[$key][] = $rowId; } } -$streetDuplicates = array_filter($streetMap, fn($entry) => count($entry['rows']) > 1); +$duplicateGroups = []; +$checked = []; +$bucketCount = count($byLastname); +$bucketDone = 0; -if (empty($streetDuplicates)) { - echo "Keine gefunden.\n"; -} else { - echo count($streetDuplicates) . " doppelte Straße(n) gefunden.\n"; +foreach ($byLastname as $bucket) { + $bucketDone++; + if ($bucketDone % 20 === 0) { + echo "\r" . progressBar($bucketDone, $bucketCount, 40) . " Analysiere... "; + } - foreach ($streetDuplicates as $data) { - askDelete($data, 'Straße', $header, $rows, $rowsToDelete); + for ($i = 0; $i < count($bucket); $i++) { + for ($j = $i + 1; $j < count($bucket); $j++) { + $idA = $bucket[$i]; + $idB = $bucket[$j]; + $key = "$idA-$idB"; + if (isset($checked[$key])) continue; + $checked[$key] = true; + + $reasons = isDuplicate($rows[$idA], $rows[$idB]); + if ($reasons === false) continue; + + $merged = false; + foreach ($duplicateGroups as &$group) { + if (in_array($idA, $group['rows']) || in_array($idB, $group['rows'])) { + if (!in_array($idA, $group['rows'])) $group['rows'][] = $idA; + if (!in_array($idB, $group['rows'])) $group['rows'][] = $idB; + $group['reasons'] = array_unique(array_merge($group['reasons'], $reasons)); + $merged = true; + break; + } + } + unset($group); + + if (!$merged) { + $duplicateGroups[] = ['rows' => [$idA, $idB], 'reasons' => $reasons]; + } + } } } +echo "\r" . progressBar($bucketCount, $bucketCount, 40) . " Analyse abgeschlossen.\n\n"; + +processDuplicateGroups($duplicateGroups, $header, $rows, $rowsToDelete, 'Mehrfachanmeldung'); + // ------------------------------------------------------- -// CSV schreiben falls Zeilen zum Löschen vorgemerkt +// CSV schreiben // ------------------------------------------------------- if (empty($rowsToDelete)) { echo "\nKeine Zeilen zum Löschen vorgemerkt. Keine Ausgabedatei erstellt.\n"; @@ -163,23 +366,24 @@ if (empty($rowsToDelete)) { } $outputDir = __DIR__ . '/output-csv'; -$outputPath = $outputDir . '/' . $filename; +$outputPath = $outputDir . '/clean_' . $filename; if (!is_dir($outputDir)) { mkdir($outputDir, 0755, true); } $out = fopen($outputPath, 'w'); -fputcsv($out, $header, $delimiter); + +$headerIso = array_map(fn($cell) => mb_convert_encoding($cell, 'ISO-8859-1', 'UTF-8'), $header); +fputcsv($out, $headerIso, $delimiter); foreach ($rows as $rowNum => $row) { - if (isset($rowsToDelete[$rowNum])) { - continue; - } - fputcsv($out, $row, $delimiter); + if (isset($rowsToDelete[$rowNum])) continue; + $rowIso = array_map(fn($cell) => mb_convert_encoding($cell, 'ISO-8859-1', 'UTF-8'), $row); + fputcsv($out, $rowIso, $delimiter); } fclose($out); -echo "\nBereinigte CSV gespeichert: output-csv/$filename\n"; +echo "\nBereinigte CSV gespeichert: output-csv/clean_$filename\n"; echo "Gelöschte Zeilen gesamt: " . count($rowsToDelete) . "\n"; \ No newline at end of file