155 lines
4.3 KiB
TypeScript
155 lines
4.3 KiB
TypeScript
/**
 * Import clinic_registry_working.csv into the clinic_registry table.
 *
 * Usage:
 *   npx tsx scripts/import-registry.ts
 *
 * Required environment variables:
 *   SUPABASE_URL (or VITE_SUPABASE_URL), SUPABASE_SERVICE_ROLE_KEY
 *
 * They may also be defined in a .env.local file.
 */
|
|
|
|
import { createClient } from "@supabase/supabase-js";
|
|
import { readFileSync } from "fs";
|
|
import { config } from "dotenv";
|
|
|
|
// Hand .env.local to dotenv before any environment variable is read below.
config({ path: ".env.local" });

// Pull the three variables of interest out of the environment in one step.
const {
  SUPABASE_URL: primaryUrl,
  VITE_SUPABASE_URL: fallbackUrl,
  SUPABASE_SERVICE_ROLE_KEY: SUPABASE_KEY,
} = process.env;

// The VITE_-prefixed URL is used only when SUPABASE_URL is unset or empty.
const SUPABASE_URL = primaryUrl || fallbackUrl;

// Abort immediately unless both the URL and the service-role key are present.
if (!(SUPABASE_URL && SUPABASE_KEY)) {
  console.error("Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY");
  process.exit(1);
}

// Client shared by the rest of the script.
const supabase = createClient(SUPABASE_URL, SUPABASE_KEY);
|
|
|
|
/**
 * Zero-based positions of the CSV columns this script reads.
 *
 * From index 6 onward every URL column is followed by a companion "note"
 * column (the odd indices 7-23). Those note columns are not mapped here; they
 * are listed in the trailing comments only to explain the gaps in numbering.
 */
const COL = {
  // Clinic identity
  name: 0,
  brand_group: 1,
  district: 2,
  branches: 3,

  // Websites
  website_kr: 4,
  website_en: 5,

  // Social and listing profiles
  youtube_url: 6, // 7: youtube_note
  instagram_kr_url: 8, // 9: instagram_kr_note
  instagram_en_url: 10, // 11: instagram_en_note
  facebook_url: 12, // 13: facebook_note
  tiktok_url: 14, // 15: tiktok_note
  gangnam_unni_url: 16, // 17: gangnam_unni_note
  naver_blog_url: 18, // 19: naver_blog_note
  naver_place_url: 20, // 21: naver_place_reviews_note
  google_maps_url: 22, // 23: google_reviews_note
} as const;
|
|
|
|
/**
 * Reduce a website URL to its bare, lower-cased host name without a leading
 * "www.".
 *
 * The input may or may not carry a scheme. When it does not, "https://" is
 * prepended so that the standard URL parser does the work; this strips ports,
 * paths, query strings and fragments the same way for both kinds of input.
 * (Parsing a scheme-less "host:8080/path" directly would treat "host" as the
 * scheme and yield an empty hostname.)
 *
 * @param url - Website address, with or without a scheme.
 * @returns The normalized host name, or an empty string when the input is
 *   blank or cannot be parsed as a URL.
 */
function extractDomain(url: string): string {
  const trimmed = url.trim();
  if (!trimmed) {
    return "";
  }

  // Only inputs that already start with "<scheme>://" are parsed as-is.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const candidate = hasScheme ? trimmed : `https://${trimmed}`;

  try {
    return new URL(candidate).hostname.toLowerCase().replace(/^www\./, "");
  } catch {
    // Not a URL even with a scheme supplied; signal "no domain" to the caller.
    return "";
  }
}
|
|
|
|
/**
 * Split one CSV record into its fields, following RFC 4180 quoting rules.
 *
 * - A field that starts with a double quote runs until the matching closing
 *   quote and may contain commas; a doubled quote ("") inside it stands for a
 *   single literal quote.
 * - A double quote anywhere other than the start of a field is kept as-is.
 * - Lines without any quotes produce the same result as `line.split(",")`.
 *
 * Only a single line is handled: a quoted field that spans several physical
 * lines cannot be reassembled here.
 *
 * @param line - One record of the CSV file, without its line terminator.
 * @returns The field values in column order (always at least one element).
 */
function parseCSVLine(line: string): string[] {
  const fields: string[] = [];
  let current = "";
  let inQuotes = false;
  let atFieldStart = true;

  for (let i = 0; i < line.length; i++) {
    const ch = line.charAt(i);

    if (inQuotes) {
      if (ch !== '"') {
        current += ch;
      } else if (line.charAt(i + 1) === '"') {
        // Escaped quote: emit one quote and skip its twin.
        current += '"';
        i++;
      } else {
        inQuotes = false;
      }
      continue;
    }

    if (ch === ",") {
      fields.push(current);
      current = "";
      atFieldStart = true;
      continue;
    }

    if (ch === '"' && atFieldStart) {
      inQuotes = true;
    } else {
      current += ch;
    }
    atFieldStart = false;
  }

  // The last field has no trailing comma to flush it.
  fields.push(current);
  return fields;
}
|
|
|
|
/**
 * Read the clinic registry CSV, convert each usable row into a record and
 * upsert the records into the `clinic_registry` table, keyed on `domain`.
 *
 * - Rows without a name, without a Korean website, or whose website yields no
 *   domain are skipped and listed in the summary line.
 * - Rows that resolve to the same domain are collapsed to one record (the last
 *   row wins). Sending two rows with the same conflict key in a single upsert
 *   makes PostgreSQL reject the whole statement, which would lose the batch.
 * - Batches are sent one after another; a failed batch is logged and counted,
 *   and the remaining batches are still attempted.
 * - `process.exitCode` is set to 1 when any batch failed, so callers of the
 *   script can detect a partial import.
 */
async function main(): Promise<void> {
  const csv = readFileSync("data/clinic-registry/clinic_registry_working.csv", "utf8");
  const lines = csv.split("\n").filter((l) => l.trim());
  const rows = lines.slice(1); // skip header

  console.log(`Parsing ${rows.length} clinics from CSV...`);

  // Trimmed cell value, or null when the column is missing or blank.
  const cell = (cols: string[], index: number): string | null =>
    cols[index]?.trim() || null;

  const byDomain = new Map<string, Record<string, unknown>>();
  const skipped: string[] = [];
  const duplicates: string[] = [];

  for (const line of rows) {
    const cols = parseCSVLine(line);
    const name = cell(cols, COL.name);
    const website = cell(cols, COL.website_kr);

    if (!name || !website) {
      skipped.push(name || "(unnamed)");
      continue;
    }

    const domain = extractDomain(website);
    if (!domain) {
      skipped.push(name);
      continue;
    }

    if (byDomain.has(domain)) {
      duplicates.push(`${name} (${domain})`);
    }

    byDomain.set(domain, {
      name,
      name_aliases: [], // Can be enriched later
      domain,
      website_url: website,
      brand_group: cell(cols, COL.brand_group),
      district: cell(cols, COL.district),
      branches: cell(cols, COL.branches),
      website_en: cell(cols, COL.website_en),
      youtube_url: cell(cols, COL.youtube_url),
      instagram_url: cell(cols, COL.instagram_kr_url),
      instagram_en_url: cell(cols, COL.instagram_en_url),
      facebook_url: cell(cols, COL.facebook_url),
      tiktok_url: cell(cols, COL.tiktok_url),
      gangnam_unni_url: cell(cols, COL.gangnam_unni_url),
      naver_blog_url: cell(cols, COL.naver_blog_url),
      naver_place_url: cell(cols, COL.naver_place_url),
      google_maps_url: cell(cols, COL.google_maps_url),
      verified_by: "scrape",
      is_active: true,
    });
  }

  const records = [...byDomain.values()];

  console.log(`Prepared ${records.length} records (skipped ${skipped.length}: ${skipped.join(", ")})`);
  if (duplicates.length > 0) {
    console.warn(
      `Collapsed ${duplicates.length} row(s) sharing a domain with an earlier row (last row wins): ${duplicates.join(", ")}`,
    );
  }

  // Upsert in batches of 20
  const BATCH_SIZE = 20;
  let upserted = 0; // upsert does not reveal whether a row was inserted or updated
  let errors = 0;

  for (let i = 0; i < records.length; i += BATCH_SIZE) {
    const batchNumber = i / BATCH_SIZE + 1;
    const batch = records.slice(i, i + BATCH_SIZE);
    const { data, error } = await supabase
      .from("clinic_registry")
      .upsert(batch, { onConflict: "domain" })
      .select("id, domain");

    if (error) {
      console.error(`Batch ${batchNumber} error:`, error.message);
      errors += batch.length;
    } else {
      const rowCount = data?.length ?? 0;
      upserted += rowCount;
      console.log(`Batch ${batchNumber}: ${rowCount} rows upserted`);
    }
  }

  console.log(`\nDone! Inserted/updated: ${upserted}, Errors: ${errors}`);

  // Verify
  const { count, error: countError } = await supabase
    .from("clinic_registry")
    .select("*", { count: "exact", head: true });

  if (countError) {
    console.error("Could not verify row count:", countError.message);
  } else {
    console.log(`Total rows in clinic_registry: ${count}`);
  }

  if (errors > 0) {
    process.exitCode = 1;
  }
}
|
|
|
|
// Run the import. An uncaught failure is logged and the process is marked as
// failed so that shells and CI see a non-zero exit status.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|