|
|
|
@@ -19,8 +19,12 @@ if (!$handle) { |
|
|
|
exit(1); |
|
|
|
} |
|
|
|
|
|
|
|
const COL_STREET = 5; |
|
|
|
const COL_EMAIL = 7; |
|
|
|
const COL_FIRSTNAME = 3; |
|
|
|
const COL_LASTNAME = 4; |
|
|
|
const COL_STREET = 5; |
|
|
|
const COL_PLZORT = 6; |
|
|
|
const COL_EMAIL = 7; |
|
|
|
const COL_PHONE = 8; |
|
|
|
|
|
|
|
// Delimiter auto-erkennen |
|
|
|
$firstLine = fgets($handle); |
|
|
|
@@ -30,12 +34,9 @@ rewind($handle); |
|
|
|
|
|
|
|
echo "Erkannter Delimiter: '$delimiter'\n"; |
|
|
|
|
|
|
|
// Header einlesen |
|
|
|
$header = fgetcsv($handle, 0, $delimiter); |
|
|
|
$header = array_map(fn($cell) => mb_convert_encoding($cell, 'UTF-8', 'ISO-8859-1'), $header); |
|
|
|
|
|
|
|
$emailMap = []; |
|
|
|
$streetMap = []; |
|
|
|
$rows = []; |
|
|
|
$rowNumber = 1; |
|
|
|
|
|
|
|
@@ -43,19 +44,6 @@ while (($row = fgetcsv($handle, 0, $delimiter)) !== false) { |
|
|
|
$rowNumber++; |
|
|
|
$row = array_map(fn($cell) => mb_convert_encoding($cell, 'UTF-8', 'ISO-8859-1'), $row); |
|
|
|
$rows[$rowNumber] = $row; |
|
|
|
|
|
|
|
$email = mb_strtolower(trim($row[COL_EMAIL] ?? '')); |
|
|
|
$street = mb_strtolower(trim($row[COL_STREET] ?? '')); |
|
|
|
|
|
|
|
if ($email !== '') { |
|
|
|
$emailMap[$email]['original'] = trim($row[COL_EMAIL]); |
|
|
|
$emailMap[$email]['rows'][] = $rowNumber; |
|
|
|
} |
|
|
|
|
|
|
|
if ($street !== '') { |
|
|
|
$streetMap[$street]['original'] = trim($row[COL_STREET]); |
|
|
|
$streetMap[$street]['rows'][] = $rowNumber; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
fclose($handle); |
|
|
|
@@ -64,35 +52,127 @@ fclose($handle); |
|
|
|
// Hilfsfunktionen |
|
|
|
// ------------------------------------------------------- |
|
|
|
|
|
|
|
function printBlock(array $data, string $label, array $header, array $rows): void |
|
|
|
/**
 * Transliterates German umlauts and the sharp s into ASCII digraphs.
 *
 * Upper-case umlauts are mapped to lower-case digraphs as well, so the
 * result is not case-preserving for those characters.
 *
 * @param string $str Text that may contain ä, ö, ü, ß, Ä, Ö or Ü.
 * @return string The text with each of those characters replaced.
 */
function normalizeUmlauts(string $str): string
{
    $transliteration = [
        'ä' => 'ae',
        'ö' => 'oe',
        'ü' => 'ue',
        'ß' => 'ss',
        'Ä' => 'ae',
        'Ö' => 'oe',
        'Ü' => 'ue',
    ];

    return strtr($str, $transliteration);
}
|
|
|
|
|
|
|
/**
 * Brings a name into a canonical form for comparison.
 *
 * The name is trimmed, lower-cased (multibyte-aware) and finally passed
 * through normalizeUmlauts().
 *
 * @param string $name Raw name as read from the input.
 * @return string Canonical form of the name.
 */
function normalizeName(string $name): string
{
    $trimmed = trim($name);
    $lowered = mb_strtolower($trimmed);

    return normalizeUmlauts($lowered);
}
|
|
|
|
|
|
|
/**
 * Returns the first whitespace-delimited token of a name.
 *
 * @param string $name Name that may consist of several words.
 * @return string The first word of the trimmed name, or '' if there is none.
 */
function firstToken(string $name): string
{
    $tokens = preg_split('/\s+/', trim($name));

    return $tokens[0] ?? '';
}
|
|
|
|
|
|
|
/**
 * Tells whether two strings reach a minimum similarity percentage.
 *
 * The percentage is the one reported by similar_text(). Two empty strings
 * yield 0 percent and are therefore not considered similar.
 *
 * @param string $a First string.
 * @param string $b Second string.
 * @param int $threshold Minimum percentage (inclusive) to count as similar.
 * @return bool True when the reported percentage is at least $threshold.
 */
function isSimilar(string $a, string $b, int $threshold = 80): bool
{
    $similarity = 0.0;
    similar_text($a, $b, $similarity);

    return $threshold <= $similarity;
}
|
|
|
|
|
|
|
/**
 * Compares two names for similarity after normalising both.
 *
 * @param string $a First name value.
 * @param string $b Second name value.
 * @return bool Result of isSimilar() on the normalised names (default threshold).
 */
function nameSimilar(string $a, string $b): bool
{
    $left = normalizeName($a);
    $right = normalizeName($b);

    return isSimilar($left, $right);
}
|
|
|
|
|
|
|
/**
 * Compares two first names using only their first word.
 *
 * Each value is reduced to its first token, normalised, and the results
 * are compared with isSimilar() at the default threshold.
 *
 * @param string $a First-name field of one row.
 * @param string $b First-name field of the other row.
 * @return bool True when the leading tokens are similar.
 */
function firstNameSimilar(string $a, string $b): bool
{
    $left = normalizeName(firstToken($a));
    $right = normalizeName(firstToken($b));

    return isSimilar($left, $right);
}
|
|
|
|
|
|
|
/**
 * Extracts the first run of four or five digits from a combined
 * postal-code/city field.
 *
 * The digits may appear anywhere in the string; a longer digit run yields
 * its first five digits.
 *
 * @param string $plzOrt Combined postal code and city text.
 * @return string The matched digits, or '' when no such run exists.
 */
function extractPlz(string $plzOrt): string
{
    $found = [];
    $hit = preg_match('/\d{4,5}/', $plzOrt, $found);

    return $hit === 1 ? $found[0] : '';
}
|
|
|
|
|
|
|
/**
 * Extracts the lower-cased city part from a combined postal-code/city field.
 *
 * Only a leading run of four or five digits (plus following whitespace) is
 * removed; digits elsewhere in the string are kept.
 *
 * @param string $plzOrt Combined postal code and city text.
 * @return string Trimmed, lower-cased remainder of the field.
 */
function extractOrt(string $plzOrt): string
{
    $withoutPlz = preg_replace('/^\d{4,5}\s*/', '', trim($plzOrt));
    $city = trim($withoutPlz);

    return mb_strtolower($city);
}
|
|
|
|
|
|
|
/**
 * Reduces a phone number to its digits.
 *
 * Every non-digit character (spaces, '+', brackets, dashes, ...) is removed,
 * so numbers are only equal afterwards if they were written with the same
 * prefix form.
 *
 * @param string $phone Phone number as entered.
 * @return string The digits of the number in their original order.
 */
function normalizePhone(string $phone): string
{
    $digitsOnly = preg_replace('/\D/', '', $phone);

    return $digitsOnly;
}
|
|
|
|
|
|
|
/**
 * Decides whether two CSV rows look like the same person registered twice.
 *
 * The last names and the first tokens of the first names must both be
 * similar; otherwise the rows are never treated as duplicates. If the names
 * match, supporting evidence is collected from the postal code, city,
 * street and phone columns.
 *
 * @param array $rowA Cells of the first row, indexed by the COL_* constants.
 * @param array $rowB Cells of the second row, indexed by the COL_* constants.
 * @return array|false List of human-readable reasons, or false when the rows
 *                     are not considered duplicates.
 */
function isDuplicate(array $rowA, array $rowB)
{
    // Missing name cells are read as '' like every other column below, so a
    // short row is rejected here instead of passing null to a string parameter.
    if (!nameSimilar($rowA[COL_LASTNAME] ?? '', $rowB[COL_LASTNAME] ?? '')) return false;
    if (!firstNameSimilar($rowA[COL_FIRSTNAME] ?? '', $rowB[COL_FIRSTNAME] ?? '')) return false;

    $reasons = [];

    $plzA = extractPlz($rowA[COL_PLZORT] ?? '');
    $plzB = extractPlz($rowB[COL_PLZORT] ?? '');
    $ortA = extractOrt($rowA[COL_PLZORT] ?? '');
    $ortB = extractOrt($rowB[COL_PLZORT] ?? '');
    $strA = mb_strtolower(trim($rowA[COL_STREET] ?? ''));
    $strB = mb_strtolower(trim($rowB[COL_STREET] ?? ''));
    $phoneA = normalizePhone($rowA[COL_PHONE] ?? '');
    $phoneB = normalizePhone($rowB[COL_PHONE] ?? '');

    // Each comparison only counts when both sides actually carry a value.
    if ($plzA !== '' && $plzB !== '' && $plzA === $plzB) {
        $reasons[] = "gleiche PLZ ($plzA)";
    }

    if ($ortA !== '' && $ortB !== '' && isSimilar($ortA, $ortB)) {
        $reasons[] = "ähnlicher Ort ($ortA ≈ $ortB)";
    }

    if ($strA !== '' && $strB !== '' && isSimilar($strA, $strB)) {
        $reasons[] = "ähnliche Straße ($strA ≈ $strB)";
    }

    if ($phoneA !== '' && $phoneB !== '' && $phoneA === $phoneB) {
        $reasons[] = "gleiche Telefonnummer ($phoneA)";
    }

    // A missing postal code on either side is itself recorded as a reason,
    // so a name match with incomplete address data is still reported.
    if ($plzA === '' || $plzB === '') {
        $reasons[] = "PLZ/Ort fehlt bei einem Eintrag";
    }

    if (empty($reasons)) return false;

    return $reasons;
}
|
|
|
|
|
|
|
function printBlock(array $rowNums, array $header, array $rows, array $reasons): void |
|
|
|
{ |
|
|
|
$colWidth = 22; |
|
|
|
$colWidth = 20; |
|
|
|
$separator = str_repeat('-', count($header) * ($colWidth + 3)) . "\n"; |
|
|
|
$headerLine = implode(' | ', array_map(fn($h) => str_pad(mb_substr($h, 0, $colWidth), $colWidth), $header)); |
|
|
|
|
|
|
|
echo "\n" . $separator; |
|
|
|
echo "$label: {$data['original']}\n"; |
|
|
|
echo "Grund: " . implode(', ', $reasons) . "\n"; |
|
|
|
echo $separator; |
|
|
|
echo "Zeile | $headerLine\n"; |
|
|
|
echo $separator; |
|
|
|
|
|
|
|
foreach ($data['rows'] as $rowNum) { |
|
|
|
foreach ($rowNums as $rowNum) { |
|
|
|
$cells = array_map( |
|
|
|
fn($cell) => str_pad(mb_substr($cell, 0, $colWidth), $colWidth), |
|
|
|
$rows[$rowNum] |
|
|
|
); |
|
|
|
echo " $rowNum | " . implode(' | ', $cells) . "\n"; |
|
|
|
echo " " . str_pad((string)$rowNum, 5) . "| " . implode(' | ', $cells) . "\n"; |
|
|
|
} |
|
|
|
|
|
|
|
echo $separator; |
|
|
|
} |
|
|
|
|
|
|
|
function askDelete(array $data, string $label, array $header, array $rows, array &$rowsToDelete): void |
|
|
|
function askDelete(array $rowNums, array $header, array $rows, array $reasons, array &$rowsToDelete): void |
|
|
|
{ |
|
|
|
printBlock($data, $label, $header, $rows); |
|
|
|
printBlock($rowNums, $header, $rows, $reasons); |
|
|
|
|
|
|
|
$keep = $data['rows'][0]; |
|
|
|
$delete = array_slice($data['rows'], 1); |
|
|
|
$keep = $rowNums[0]; |
|
|
|
$delete = array_slice($rowNums, 1); |
|
|
|
|
|
|
|
echo "Behalten: Zeile $keep – Löschen: Zeilen " . implode(', ', $delete) . "\n"; |
|
|
|
echo "[1] Löschen [Enter] Überspringen: "; |
|
|
|
@@ -108,54 +188,177 @@ function askDelete(array $data, string $label, array $header, array $rows, array |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/**
 * Renders a textual progress bar such as "[████░░░░] 50 % (1/2)".
 *
 * The completed fraction is clamped to the range 0..1 and a negative width
 * is treated as 0, so out-of-range arguments produce an empty or full bar
 * instead of a negative repeat count. The trailing "(current/total)" part
 * always shows the values exactly as passed in.
 *
 * @param int $current Number of items processed so far.
 * @param int $total Total number of items; 0 or less is shown as complete.
 * @param int $width Number of bar cells between the brackets.
 * @return string The formatted bar without a trailing newline.
 */
function progressBar(int $current, int $total, int $width = 30): string
{
    if ($width < 0) {
        $width = 0;
    }

    $pct = $total > 0 ? $current / $total : 1;

    // Keep the fraction inside 0..1 so neither str_repeat() count below can go negative.
    if ($pct < 0) {
        $pct = 0;
    } elseif ($pct > 1) {
        $pct = 1;
    }

    $filled = (int)round($pct * $width);
    $bar = str_repeat('█', $filled) . str_repeat('░', $width - $filled);

    // Explicit string cast: str_pad() expects a string as its first argument.
    $percentText = str_pad((string)(int)round($pct * 100), 3);

    return '[' . $bar . '] ' . $percentText . "% ($current/$total)";
}
|
|
|
|
|
|
|
/**
 * Walks through duplicate groups and asks for each one whether to delete.
 *
 * Rows already marked in $rowsToDelete are removed from a group before it is
 * shown; groups with fewer than two remaining rows are skipped silently.
 * A progress line is printed before every group that is shown and once more
 * after the last group.
 *
 * @param array $groups List of groups, each with 'rows' (row numbers) and 'reasons'.
 * @param array $header Header cells, passed through to askDelete().
 * @param array $rows All data rows keyed by row number, passed through to askDelete().
 * @param array $rowsToDelete Set of row numbers marked for deletion (keys); passed on by reference.
 * @param string $stepLabel Label appended to the progress lines.
 */
function processDuplicateGroups(array $groups, array $header, array $rows, array &$rowsToDelete, string $stepLabel): void
{
    $total = count($groups);

    if ($total === 0) {
        echo "Keine gefunden.\n";
        return;
    }

    echo "$total Gruppe(n) gefunden.\n";

    $position = 0;
    foreach ($groups as $candidate) {
        ++$position;

        // Keep only rows that have not been marked for deletion yet.
        $remaining = [];
        foreach ($candidate['rows'] as $rowNumber) {
            if (!isset($rowsToDelete[$rowNumber])) {
                $remaining[] = $rowNumber;
            }
        }

        if (count($remaining) >= 2) {
            echo "\n" . progressBar($position, $total) . " – $stepLabel\n";
            askDelete($remaining, $header, $rows, $candidate['reasons'], $rowsToDelete);
        }
    }

    echo "\n" . progressBar($total, $total) . " – $stepLabel abgeschlossen.\n";
}
|
|
|
|
|
|
|
// ------------------------------------------------------- |
|
|
|
// SCHRITT 1: E-Mail-Duplikate |
|
|
|
// ------------------------------------------------------- |
|
|
|
$emailDuplicates = array_filter($emailMap, fn($entry) => count($entry['rows']) > 1); |
|
|
|
$rowsToDelete = []; |
|
|
|
|
|
|
|
echo "\n========================================\n"; |
|
|
|
echo " SCHRITT 1: Doppelte E-Mail-Adressen\n"; |
|
|
|
echo " SCHRITT 1: E-Mail-Duplikate\n"; |
|
|
|
echo "========================================\n"; |
|
|
|
|
|
|
|
if (empty($emailDuplicates)) { |
|
|
|
echo "Keine gefunden.\n"; |
|
|
|
} else { |
|
|
|
echo count($emailDuplicates) . " doppelte E-Mail-Adresse(n) gefunden.\n"; |
|
|
|
$emailMap = []; |
|
|
|
foreach ($rows as $rowNum => $row) { |
|
|
|
$email = mb_strtolower(trim($row[COL_EMAIL] ?? '')); |
|
|
|
if ($email !== '') { |
|
|
|
$emailMap[$email][] = $rowNum; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
$autoDeleteGroups = []; |
|
|
|
$manualReviewGroups = []; |
|
|
|
|
|
|
|
foreach ($emailDuplicates as $data) { |
|
|
|
askDelete($data, 'E-Mail', $header, $rows, $rowsToDelete); |
|
|
|
foreach ($emailMap as $email => $rowNums) { |
|
|
|
if (count($rowNums) < 2) continue; |
|
|
|
|
|
|
|
// Prüfen ob alle Einträge gleichen Vor- + Nachnamen haben |
|
|
|
$autoDelete = true; |
|
|
|
$firstFirst = normalizeName(firstToken($rows[$rowNums[0]][COL_FIRSTNAME] ?? '')); |
|
|
|
$firstLast = normalizeName($rows[$rowNums[0]][COL_LASTNAME] ?? ''); |
|
|
|
|
|
|
|
foreach ($rowNums as $rowNum) { |
|
|
|
$first = normalizeName(firstToken($rows[$rowNum][COL_FIRSTNAME] ?? '')); |
|
|
|
$last = normalizeName($rows[$rowNum][COL_LASTNAME] ?? ''); |
|
|
|
if (!isSimilar($first, $firstFirst) || !isSimilar($last, $firstLast)) { |
|
|
|
$autoDelete = false; |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if ($autoDelete) { |
|
|
|
$autoDeleteGroups[] = ['rows' => $rowNums, 'reasons' => ["gleiche E-Mail + gleicher Name ($email)"]]; |
|
|
|
} else { |
|
|
|
$manualReviewGroups[] = ['rows' => $rowNums, 'reasons' => ["E-Mail: $email"]]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
$rowsToDelete = []; |
|
|
|
$autoDeleteCount = 0; |
|
|
|
|
|
|
|
// Auto-Delete: gleiche E-Mail + gleicher Name |
|
|
|
if (!empty($autoDeleteGroups)) { |
|
|
|
echo "\n--- Auto-Delete: gleiche E-Mail + gleicher Name ---\n"; |
|
|
|
foreach ($autoDeleteGroups as $group) { |
|
|
|
$keep = $group['rows'][0]; |
|
|
|
$delete = array_slice($group['rows'], 1); |
|
|
|
|
|
|
|
printBlock($group['rows'], $header, $rows, $group['reasons']); |
|
|
|
echo "Behalten: Zeile $keep – Auto-gelöscht: Zeilen " . implode(', ', $delete) . "\n"; |
|
|
|
|
|
|
|
foreach ($delete as $rowNum) { |
|
|
|
$rowsToDelete[$rowNum] = true; |
|
|
|
$autoDeleteCount++; |
|
|
|
} |
|
|
|
} |
|
|
|
echo "\n========================================\n"; |
|
|
|
echo " Auto-Delete Zusammenfassung\n"; |
|
|
|
echo "========================================\n"; |
|
|
|
echo "Gruppen: " . count($autoDeleteGroups) . "\n"; |
|
|
|
echo "Gelöschte Zeilen: $autoDeleteCount\n"; |
|
|
|
echo "Zeilen: " . implode(', ', array_keys($rowsToDelete)) . "\n"; |
|
|
|
} |
|
|
|
|
|
|
|
// Manuelle Prüfung: gleiche E-Mail aber unterschiedliche Namen |
|
|
|
if (!empty($manualReviewGroups)) { |
|
|
|
echo "\n--- Manuelle Prüfung: gleiche E-Mail, unterschiedliche Namen ---\n"; |
|
|
|
processDuplicateGroups($manualReviewGroups, $header, $rows, $rowsToDelete, 'E-Mail'); |
|
|
|
} else { |
|
|
|
echo "\nKeine manuellen E-Mail-Duplikate.\n"; |
|
|
|
} |
|
|
|
|
|
|
|
// ------------------------------------------------------- |
|
|
|
// SCHRITT 2: Straßen-Duplikate (bereits gelöschte Zeilen ausblenden) |
|
|
|
// SCHRITT 2: Mehrfachanmeldungen |
|
|
|
// ------------------------------------------------------- |
|
|
|
echo "\n========================================\n"; |
|
|
|
echo " SCHRITT 2: Doppelte Straßen\n"; |
|
|
|
echo " SCHRITT 2: Mehrfachanmeldungen\n"; |
|
|
|
echo "========================================\n"; |
|
|
|
|
|
|
|
foreach ($streetMap as $key => $data) { |
|
|
|
$streetMap[$key]['rows'] = array_values(array_filter($data['rows'], fn($r) => !isset($rowsToDelete[$r]))); |
|
|
|
if (count($streetMap[$key]['rows']) < 2) { |
|
|
|
unset($streetMap[$key]); |
|
|
|
$byLastname = []; |
|
|
|
foreach ($rows as $rowId => $row) { |
|
|
|
if (isset($rowsToDelete[$rowId])) continue; |
|
|
|
$key = mb_substr(normalizeName($row[COL_LASTNAME] ?? ''), 0, 3); |
|
|
|
if ($key !== '') { |
|
|
|
$byLastname[$key][] = $rowId; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
$streetDuplicates = array_filter($streetMap, fn($entry) => count($entry['rows']) > 1); |
|
|
|
$duplicateGroups = []; |
|
|
|
$checked = []; |
|
|
|
$bucketCount = count($byLastname); |
|
|
|
$bucketDone = 0; |
|
|
|
|
|
|
|
if (empty($streetDuplicates)) { |
|
|
|
echo "Keine gefunden.\n"; |
|
|
|
} else { |
|
|
|
echo count($streetDuplicates) . " doppelte Straße(n) gefunden.\n"; |
|
|
|
foreach ($byLastname as $bucket) { |
|
|
|
$bucketDone++; |
|
|
|
if ($bucketDone % 20 === 0) { |
|
|
|
echo "\r" . progressBar($bucketDone, $bucketCount, 40) . " Analysiere... "; |
|
|
|
} |
|
|
|
|
|
|
|
foreach ($streetDuplicates as $data) { |
|
|
|
askDelete($data, 'Straße', $header, $rows, $rowsToDelete); |
|
|
|
for ($i = 0; $i < count($bucket); $i++) { |
|
|
|
for ($j = $i + 1; $j < count($bucket); $j++) { |
|
|
|
$idA = $bucket[$i]; |
|
|
|
$idB = $bucket[$j]; |
|
|
|
$key = "$idA-$idB"; |
|
|
|
if (isset($checked[$key])) continue; |
|
|
|
$checked[$key] = true; |
|
|
|
|
|
|
|
$reasons = isDuplicate($rows[$idA], $rows[$idB]); |
|
|
|
if ($reasons === false) continue; |
|
|
|
|
|
|
|
$merged = false; |
|
|
|
foreach ($duplicateGroups as &$group) { |
|
|
|
if (in_array($idA, $group['rows']) || in_array($idB, $group['rows'])) { |
|
|
|
if (!in_array($idA, $group['rows'])) $group['rows'][] = $idA; |
|
|
|
if (!in_array($idB, $group['rows'])) $group['rows'][] = $idB; |
|
|
|
$group['reasons'] = array_unique(array_merge($group['reasons'], $reasons)); |
|
|
|
$merged = true; |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
unset($group); |
|
|
|
|
|
|
|
if (!$merged) { |
|
|
|
$duplicateGroups[] = ['rows' => [$idA, $idB], 'reasons' => $reasons]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
echo "\r" . progressBar($bucketCount, $bucketCount, 40) . " Analyse abgeschlossen.\n\n"; |
|
|
|
|
|
|
|
processDuplicateGroups($duplicateGroups, $header, $rows, $rowsToDelete, 'Mehrfachanmeldung'); |
|
|
|
|
|
|
|
// ------------------------------------------------------- |
|
|
|
// CSV schreiben falls Zeilen zum Löschen vorgemerkt |
|
|
|
// CSV schreiben |
|
|
|
// ------------------------------------------------------- |
|
|
|
if (empty($rowsToDelete)) { |
|
|
|
echo "\nKeine Zeilen zum Löschen vorgemerkt. Keine Ausgabedatei erstellt.\n"; |
|
|
|
@@ -163,23 +366,24 @@ if (empty($rowsToDelete)) { |
|
|
|
} |
|
|
|
|
|
|
|
$outputDir = __DIR__ . '/output-csv'; |
|
|
|
$outputPath = $outputDir . '/' . $filename; |
|
|
|
$outputPath = $outputDir . '/clean_' . $filename; |
|
|
|
|
|
|
|
if (!is_dir($outputDir)) { |
|
|
|
mkdir($outputDir, 0755, true); |
|
|
|
} |
|
|
|
|
|
|
|
$out = fopen($outputPath, 'w'); |
|
|
|
fputcsv($out, $header, $delimiter); |
|
|
|
|
|
|
|
$headerIso = array_map(fn($cell) => mb_convert_encoding($cell, 'ISO-8859-1', 'UTF-8'), $header); |
|
|
|
fputcsv($out, $headerIso, $delimiter); |
|
|
|
|
|
|
|
foreach ($rows as $rowNum => $row) { |
|
|
|
if (isset($rowsToDelete[$rowNum])) { |
|
|
|
continue; |
|
|
|
} |
|
|
|
fputcsv($out, $row, $delimiter); |
|
|
|
if (isset($rowsToDelete[$rowNum])) continue; |
|
|
|
$rowIso = array_map(fn($cell) => mb_convert_encoding($cell, 'ISO-8859-1', 'UTF-8'), $row); |
|
|
|
fputcsv($out, $rowIso, $delimiter); |
|
|
|
} |
|
|
|
|
|
|
|
fclose($out); |
|
|
|
|
|
|
|
echo "\nBereinigte CSV gespeichert: output-csv/$filename\n"; |
|
|
|
echo "\nBereinigte CSV gespeichert: output-csv/clean_$filename\n"; |
|
|
|
echo "Gelöschte Zeilen gesamt: " . count($rowsToDelete) . "\n"; |