ETL vs ELT
๊ฐ์
ETL(Extract, Transform, Load)๊ณผ ELT(Extract, Load, Transform)๋ ๋ฐ์ดํฐ ํ์ดํ๋ผ์ธ์ ๋ ๊ฐ์ง ์ฃผ์ ํจํด์๋๋ค. ์ ํต์ ์ธ ETL์ ๋ณํ ํ ์ ์ฌํ๊ณ , ๋ชจ๋ ELT๋ ์ ์ฌ ํ ๋ณํํฉ๋๋ค.
1.1 ETL ํ๋ก์ธ์ค
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ETL Process โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ โโโโโโโโโโโโ โโโโโโโโโโโโโโโโ โโโโโโโโโโโโ โ
โ โ Sources โ โ โ ETL Server โ โ โ Target โ โ
โ โ โ โ โ โ (DW) โ โ
โ โ - DB โ โ 1. Extract โ โ โ โ
โ โ - Files โ โ 2. Transform โ โ Clean โ โ
โ โ - APIs โ โ 3. Load โ โ Data โ โ
โ โโโโโโโโโโโโ โโโโโโโโโโโโโโโโ โโโโโโโโโโโโ โ
โ โ
โ ๋ณํ์ด ์ค๊ฐ ์๋ฒ์์ ์ํ๋จ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1.2 ETL ์์ ์ฝ๋
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
class ETLPipeline:
    """Traditional ETL pipeline: Extract -> Transform -> Load.

    All transformation work happens on the ETL server (this process)
    before any data reaches the target warehouse.
    """

    def __init__(self, source_conn: str, target_conn: str):
        # Connection strings are turned into SQLAlchemy engines up front.
        self.source_engine = create_engine(source_conn)
        self.target_engine = create_engine(target_conn)

    def extract(self, query: str) -> pd.DataFrame:
        """Extract: pull data from the source system with a SQL query."""
        print(f"[Extract] Starting at {datetime.now()}")
        df = pd.read_sql(query, self.source_engine)
        print(f"[Extract] Extracted {len(df)} rows")
        return df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform: clean and enrich the extracted data.

        This step runs on the ETL server and consumes its resources.

        Raises:
            ValueError: if any negative ``amount`` survives cleaning.
        """
        print(f"[Transform] Starting at {datetime.now()}")
        # 1. Handle missing values.
        df = df.dropna(subset=['customer_id', 'amount'])
        df['email'] = df['email'].fillna('unknown@example.com')
        # 2. Data type conversion.
        df['order_date'] = pd.to_datetime(df['order_date'])
        df['amount'] = df['amount'].astype(float)
        # 3. Derived columns.
        df['year'] = df['order_date'].dt.year
        df['month'] = df['order_date'].dt.month
        df['day_of_week'] = df['order_date'].dt.dayofweek
        # 4. Business logic: segment customers by lifetime purchases.
        df['customer_segment'] = df['total_purchases'].apply(
            lambda x: 'Gold' if x > 10000 else ('Silver' if x > 5000 else 'Bronze')
        )
        # 5. Data quality check. An explicit exception replaces the
        #    original `assert`, which is stripped when Python runs with -O.
        if len(df) and df['amount'].min() < 0:
            raise ValueError("Negative amounts found")
        print(f"[Transform] Transformed {len(df)} rows")
        return df

    def load(self, df: pd.DataFrame, table_name: str):
        """Load: write the transformed data into the warehouse."""
        print(f"[Load] Starting at {datetime.now()}")
        # Full refresh: the target table is replaced wholesale.
        df.to_sql(
            table_name,
            self.target_engine,
            if_exists='replace',
            index=False,
            chunksize=10000
        )
        print(f"[Load] Loaded {len(df)} rows to {table_name}")

    def run(self, source_query: str, target_table: str):
        """Run the pipeline end to end in E -> T -> L order."""
        start_time = datetime.now()
        print(f"ETL Pipeline started at {start_time}")
        raw_data = self.extract(source_query)
        transformed_data = self.transform(raw_data)
        self.load(transformed_data, target_table)
        end_time = datetime.now()
        # total_seconds() is correct for runs longer than one day, unlike
        # the `.seconds` attribute which wraps at 24 hours.
        print(f"ETL Pipeline completed in {(end_time - start_time).total_seconds():.0f} seconds")
# Example usage: run a daily incremental ETL from the sales database into
# the analytics warehouse. Guarded so importing this module has no side effects.
if __name__ == "__main__":
    pipeline = ETLPipeline(
        source_conn="postgresql://user:pass@source-db:5432/sales",
        target_conn="postgresql://user:pass@warehouse:5432/analytics"
    )
    pipeline.run(
        source_query="""
            SELECT
                o.order_id,
                o.customer_id,
                c.email,
                o.order_date,
                o.amount,
                c.total_purchases
            FROM orders o
            JOIN customers c ON o.customer_id = c.customer_id
            WHERE o.order_date >= CURRENT_DATE - INTERVAL '1 day'
        """,
        target_table="fact_daily_orders"
    )
1.3 ETL ๋๊ตฌ

| ๋๊ตฌ | ์ ํ | ํน์ง |
|------|------|------|
| Informatica | ์์ฉ | ์ํฐํ๋ผ์ด์ฆ๊ธ, GUI ๊ธฐ๋ฐ |
| Talend | ์คํ์์ค/์์ฉ | Java ๊ธฐ๋ฐ, ๋ค์ํ ์ปค๋ฅํฐ |
| SSIS | ์์ฉ (MS) | SQL Server ํตํฉ |
| Pentaho | ์คํ์์ค | ๊ฒฝ๋, ์ฌ์ฉ ํธ์ |
| Apache NiFi | ์คํ์์ค | ๋ฐ์ดํฐ ํ๋ก์ฐ, ์ค์๊ฐ |
2.1 ELT ํ๋ก์ธ์ค
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ELT Process โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ โโโโโโโโโโโโ โโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโ โ
โ โ Sources โ โ โ Load Raw โ โ โ Transform โ โ
โ โ โ โ (Data Lake) โ โ (in DW) โ โ
โ โ - DB โ โ โ โ โ โ
โ โ - Files โ โ Raw Zone โ โ SQL/Spark โ โ
โ โ - APIs โ โ (as-is) โ โ (DW ๋ฆฌ์์ค) โ โ
โ โโโโโโโโโโโโ โโโโโโโโโโโโโโโโ โโโโโโโโโโโโโโโโ โ
โ โ
โ ๋ณํ์ด ํ๊ฒ ์์คํ(DW/Lake)์์ ์ํ๋จ โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
2.2 ELT ์์ ์ฝ๋
import pandas as pd
from datetime import datetime
class ELTPipeline:
    """Modern ELT pipeline: Extract -> Load raw -> Transform in warehouse."""

    def __init__(self, source_conn: str, warehouse_conn: str):
        self.source_conn = source_conn
        self.warehouse_conn = warehouse_conn
        # Engine is created lazily so merely constructing the pipeline
        # does not require the warehouse to be reachable.
        self._warehouse_engine = None

    def _get_warehouse_engine(self):
        """Create (once) and return a SQLAlchemy engine for the warehouse."""
        if self._warehouse_engine is None:
            from sqlalchemy import create_engine  # local: keeps example self-contained
            self._warehouse_engine = create_engine(self.warehouse_conn)
        return self._warehouse_engine

    def extract_and_load(self, source_query: str, raw_table: str):
        """Extract & Load: copy source data as-is into a raw table.

        No transformation happens here -- raw data is loaded quickly and
        reshaped later inside the warehouse.
        """
        print(f"[Extract & Load] Starting at {datetime.now()}")
        # pd.read_sql accepts a connection string/URI directly.
        df = pd.read_sql(source_query, self.source_conn)
        # DataFrame.to_sql requires a SQLAlchemy connectable, not a plain
        # string (the original passed the string and would fail).
        df.to_sql(
            raw_table,
            self._get_warehouse_engine(),
            if_exists='replace',
            index=False
        )
        print(f"[Extract & Load] Loaded {len(df)} rows to {raw_table}")

    def transform_in_warehouse(self, transform_sql: str):
        """Transform: run SQL inside the warehouse (dbt-style).

        Bug fix: the original called ``.connect()`` on the connection
        *string*; an engine must be built first. SQLAlchemy 2.x also
        requires raw SQL to be wrapped in ``text()`` and an explicit
        commit for statements with side effects.
        """
        print(f"[Transform] Starting at {datetime.now()}")
        from sqlalchemy import text  # local: keeps example self-contained
        with self._get_warehouse_engine().connect() as conn:
            conn.execute(text(transform_sql))
            conn.commit()
        print(f"[Transform] Transformation completed")
# Example dbt model (SQL-based transformation): the entire transform step
# lives in the warehouse as a SELECT that dbt materializes as a table/view.
# NOTE: the string content below is data shown verbatim to the reader.
DBT_MODEL_EXAMPLE = """
-- models/staging/stg_orders.sql
-- dbt๋ฅผ ์ฌ์ฉํ ELT ๋ณํ
WITH source AS (
SELECT * FROM {{ source('raw', 'orders_raw') }}
),
cleaned AS (
SELECT
order_id,
customer_id,
COALESCE(email, 'unknown@example.com') AS email,
CAST(order_date AS DATE) AS order_date,
CAST(amount AS DECIMAL(10, 2)) AS amount,
total_purchases,
-- ํ์ ์ปฌ๋ผ
EXTRACT(YEAR FROM order_date) AS order_year,
EXTRACT(MONTH FROM order_date) AS order_month,
EXTRACT(DOW FROM order_date) AS day_of_week,
-- ๋น์ฆ๋์ค ๋ก์ง
CASE
WHEN total_purchases > 10000 THEN 'Gold'
WHEN total_purchases > 5000 THEN 'Silver'
ELSE 'Bronze'
END AS customer_segment,
-- ๋ฉํ๋ฐ์ดํฐ
CURRENT_TIMESTAMP AS loaded_at
FROM source
WHERE customer_id IS NOT NULL
AND amount IS NOT NULL
AND amount >= 0
)
SELECT * FROM cleaned
"""
# A realistic ELT pipeline in the Snowflake/BigQuery style.
class ModernELTWithSQL:
    """SQL-first modern ELT.

    Each layer transition issues one SQL statement through the injected
    ``warehouse`` handle; the warehouse engine does the heavy lifting.
    """

    def __init__(self, warehouse):
        # Any object exposing ``execute(sql)`` works here.
        self.warehouse = warehouse

    def extract_load(self, source: str, target_raw: str):
        """Source -> raw layer: bulk-copy staged files as-is."""
        copy_stmt = (
            f"COPY INTO {target_raw}\n"
            f"FROM @{source}\n"
            "FILE_FORMAT = (TYPE = 'PARQUET')"
        )
        self.warehouse.execute(copy_stmt)

    def transform_staging(self):
        """Raw -> staging layer: parse semi-structured fields into columns."""
        staging_stmt = """
            CREATE OR REPLACE TABLE staging.orders AS
            SELECT
                order_id,
                customer_id,
                PARSE_JSON(raw_data):email::STRING AS email,
                TO_DATE(raw_data:order_date) AS order_date,
                raw_data:amount::NUMBER(10,2) AS amount
            FROM raw.orders_raw
        """
        self.warehouse.execute(staging_stmt)

    def transform_mart(self):
        """Staging -> mart layer: join dimensions and add running totals."""
        mart_stmt = """
            CREATE OR REPLACE TABLE mart.fact_orders AS
            SELECT
                o.order_id,
                d.date_sk,
                c.customer_sk,
                o.amount,
                SUM(o.amount) OVER (
                    PARTITION BY o.customer_id
                    ORDER BY o.order_date
                ) AS cumulative_amount
            FROM staging.orders o
            JOIN dim_date d ON o.order_date = d.full_date
            JOIN dim_customer c ON o.customer_id = c.customer_id
        """
        self.warehouse.execute(mart_stmt)
2.3 ELT ๋๊ตฌ

| ๋๊ตฌ | ์ ํ | ํน์ง |
|------|------|------|
| dbt | ์คํ์์ค | SQL ๊ธฐ๋ฐ ๋ณํ, ํ์คํธ, ๋ฌธ์ํ |
| Fivetran | ์์ฉ | ์๋ ์คํค๋ง ๊ด๋ฆฌ, 150+ ์ปค๋ฅํฐ |
| Airbyte | ์คํ์์ค | ์ปค์คํ ์ปค๋ฅํฐ, EL ํนํ |
| Stitch | ์์ฉ | ๊ฐํธํ ์ค์ , SaaS ์นํ์ |
| AWS Glue | ํด๋ผ์ฐ๋ | ์๋ฒ๋ฆฌ์ค, Spark ๊ธฐ๋ฐ |
3. ETL vs ELT ๋น๊ต

3.1 ์์ธ ๋น๊ต

| ํน์ฑ | ETL | ELT |
|------|-----|-----|
| ๋ณํ ์์น | ์ค๊ฐ ์๋ฒ | ํ๊ฒ ์์คํ (DW/Lake) |
| ๋ฐ์ดํฐ ์ด๋ | ๋ณํ๋ ๋ฐ์ดํฐ๋ง | ์๋ณธ ๋ฐ์ดํฐ ์ ์ฒด |
| ์คํค๋ง | ๋ฏธ๋ฆฌ ์ ์ ํ์ | ์ ์ฐ (Schema-on-Read) |
| ์ฒ๋ฆฌ ์๋ | ๋๋ฆผ (์ค๊ฐ ์ฒ๋ฆฌ) | ๋น ๋ฆ (๋ณ๋ ฌ ์ฒ๋ฆฌ) |
| ๋น์ฉ | ๋ณ๋ ์ธํ๋ผ ํ์ | DW ๋ฆฌ์์ค ์ฌ์ฉ |
| ์ ์ฐ์ฑ | ๋ฎ์ | ๋์ (์๋ณธ ๋ณด์กด) |
| ๋ณต์กํ ๋ณํ | ์ ํฉ | ์ ํ์ |
| ์ค์๊ฐ ์ฒ๋ฆฌ | ์ด๋ ค์ | ๋น๊ต์ ์ฉ์ด |
3.2 ์ ํ ๊ธฐ์ค
def choose_etl_or_elt(requirements: dict) -> str:
    """Recommend ETL, ELT, or a hybrid based on project requirements.

    Each truthy flag in ``requirements`` votes for one approach; the side
    with more votes wins, and a tie suggests a hybrid pipeline.
    """
    # Signals that favour classic ETL.
    etl_signals = (
        'data_privacy',        # sensitive data must be masked before load
        'complex_transforms',  # heavy business logic
        'legacy_systems',      # legacy system integration
        'small_data',          # small data volumes
    )
    # Signals that favour ELT.
    elt_signals = (
        'big_data',            # large data volumes
        'cloud_dw',            # cloud data warehouse in use
        'data_lake',           # building a data lake
        'flexible_schema',     # schema flexibility required
        'raw_data_access',     # raw data must stay queryable
        'sql_transforms',      # transforms expressible in SQL
    )
    etl_score = sum(requirements.get(key, False) for key in etl_signals)
    elt_score = sum(requirements.get(key, False) for key in elt_signals)
    if etl_score > elt_score:
        return "ETL ๊ถ์ฅ"
    if elt_score > etl_score:
        return "ELT ๊ถ์ฅ"
    return "ํ์ด๋ธ๋ฆฌ๋ ๊ณ ๋ ค"
# Example usage: a cloud-warehouse project with large data volumes.
project_requirements = {
    'big_data': True,
    'cloud_dw': True,  # e.g. Snowflake, BigQuery
    'sql_transforms': True,
    'raw_data_access': True,
}
recommendation = choose_etl_or_elt(project_requirements)
print(recommendation)  # -> "ELT ๊ถ์ฅ"
4. ํ์ด๋ธ๋ฆฌ๋ ์ ๊ทผ๋ฒ
4.1 ETLT ํจํด
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ ETLT (Hybrid) Pattern โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
โ โ
โ Sources โ [E] โ [T] โ [L] โ [T] โ Mart โ
โ โ โ โ
โ โ โ โ
โ Light Transform Heavy Transform โ
โ (๋ง์คํน, ๊ฒ์ฆ) (์ง๊ณ, ์กฐ์ธ) โ
โ ETL Server Data Warehouse โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
4.2 ํ์ด๋ธ๋ฆฌ๋ ๊ตฌํ ์์
class HybridPipeline:
    """Hybrid ETLT pipeline: light transform on extract, heavy transform in DW.

    Phase 1 masks PII and normalizes types at the source (ETL-style),
    phase 2 lands the result in a staging area, and phase 3 runs the
    expensive joins/aggregations inside the warehouse (ELT-style).
    """

    def __init__(self, source, staging_area, warehouse):
        self.source = source
        self.staging = staging_area
        self.warehouse = warehouse

    def extract_with_light_transform(self):
        """E + light T: extract while applying cheap row-level transforms.

        - PII masking (privacy protection) performed at the source
        - basic data type conversion
        - required-field validation
        """
        extraction_sql = """
        SELECT
            order_id,
            -- PII ๋ง์คํน (์์ค์์ ์ํ)
            MD5(customer_email) AS customer_email_hash,
            SUBSTRING(phone, 1, 3) || '****' || SUBSTRING(phone, -4) AS phone_masked,
            -- ๊ธฐ๋ณธ ๋ณํ
            CAST(order_date AS DATE) AS order_date,
            CAST(amount AS DECIMAL(10, 2)) AS amount
        FROM orders
        WHERE amount IS NOT NULL
        """
        return self.source.execute(extraction_sql)

    def load_to_staging(self, data):
        """L: land the lightly-transformed rows in the staging area."""
        self.staging.load(data, 'orders_staging')

    def transform_in_warehouse(self):
        """Heavy T: joins, aggregation and window functions inside the DW."""
        heavy_sql = """
        CREATE TABLE mart.order_analysis AS
        SELECT
            o.order_date,
            c.customer_segment,
            p.product_category,
            COUNT(*) AS order_count,
            SUM(o.amount) AS total_amount,
            AVG(o.amount) AS avg_order_value,
            -- ์๋์ฐ ํจ์ (DW์์ ํจ์จ์ )
            SUM(o.amount) OVER (
                PARTITION BY c.customer_segment
                ORDER BY o.order_date
                ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
            ) AS rolling_7day_amount
        FROM orders_staging o
        JOIN dim_customer c ON o.customer_id = c.customer_id
        JOIN dim_product p ON o.product_id = p.product_id
        GROUP BY o.order_date, c.customer_segment, p.product_category
        """
        self.warehouse.execute(heavy_sql)

    def run(self):
        """Execute the three hybrid phases in order."""
        extracted = self.extract_with_light_transform()  # Phase 1: ETL (E + light T)
        self.load_to_staging(extracted)                  # Phase 2: load to staging
        self.transform_in_warehouse()                    # Phase 3: ELT (heavy T in DW)
5. ์ค๋ฌด ์ฌ๋ก
5.1 ETL ์ฌ์ฉ ์ฌ๋ก
# ์ฌ๋ก 1: ๊ฐ์ธ์ ๋ณด ์ฒ๋ฆฌ (GDPR ์ค์)
class GDPRCompliantETL:
    """GDPR-compliant ETL: sensitive fields are masked BEFORE loading."""

    def transform(self, df):
        """Mask PII columns and keep only rows with recorded consent."""
        # Mask sensitive values prior to any load out of the region.
        df['email'] = df['email'].apply(self.mask_email)
        df['ssn'] = df['ssn'].apply(lambda value: 'XXX-XX-' + value[-4:])
        df['credit_card'] = df['credit_card'].apply(lambda value: '**** **** **** ' + value[-4:])
        # Rows may only be transferred outside the EU with consent.
        df = df[df['consent_given'] == True]
        return df

    def mask_email(self, email):
        """Return ``ab***@domain`` for a real address, ``None`` if missing."""
        if pd.isna(email):
            return None
        local_part, domain = email.split('@')
        return local_part[:2] + '***@' + domain
# ์ฌ๋ก 2: ๋ ๊ฑฐ์ ์์คํ ํตํฉ
class LegacySystemETL:
    """Integrate fixed-width legacy mainframe extracts into a DataFrame."""

    # Fixed-width record layout: [0:10] account number, [10:12] account
    # type, [12:24] balance in cents, [24] status flag ('1' = active),
    # [25:33] date as YYYYMMDD.

    def transform(self, raw_data):
        """Parse fixed-length records into a DataFrame.

        Robustness fix: blank lines (including the trailing newline a
        mainframe dump usually ends with) are skipped instead of crashing
        on ``int('')`` as the original did.
        """
        records = []
        for line in raw_data.split('\n'):
            if not line.strip():
                continue  # skip empty/trailing lines
            records.append({
                'account_no': line[0:10].strip(),
                'account_type': line[10:12],
                'balance': int(line[12:24]) / 100,  # implied two decimal places
                'status': 'A' if line[24:25] == '1' else 'I',
                'date': self.parse_legacy_date(line[25:33]),
            })
        return pd.DataFrame(records)

    def parse_legacy_date(self, date_str):
        """Convert YYYYMMDD to ISO YYYY-MM-DD."""
        return f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
5.2 ELT ์ฌ์ฉ ์ฌ๋ก
-- ์ฌ๋ก 1: dbt๋ฅผ ํ์ฉํ ์ด์ปค๋จธ์ค ๋ถ์
-- models/staging/stg_orders.sql
WITH raw_orders AS (
SELECT * FROM {{ source('raw', 'orders') }}
)
SELECT
order_id,
customer_id,
order_date,
status,
total_amount
FROM raw_orders
WHERE order_date IS NOT NULL
-- models/marts/core/fct_orders.sql
WITH orders AS (
SELECT * FROM {{ ref('stg_orders') }}
),
customers AS (
SELECT * FROM {{ ref('dim_customers') }}
),
products AS (
SELECT * FROM {{ ref('dim_products') }}
)
SELECT
o.order_id,
o.order_date,
c.customer_segment,
p.product_category,
o.total_amount,
-- DW์์ ์๋์ฐ ํจ์ ํ์ฉ
ROW_NUMBER() OVER (PARTITION BY o.customer_id ORDER BY o.order_date) AS order_sequence,
LAG(o.order_date) OVER (PARTITION BY o.customer_id ORDER BY o.order_date) AS prev_order_date
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
JOIN products p ON o.product_id = p.product_id
-- ์ฌ๋ก 2: BigQuery ELT
-- ๋์ฉ๋ ๋ก๊ทธ ๋ถ์ (์๋ฒ๋ฆฌ์ค ์ฒ๋ฆฌ)
CREATE OR REPLACE TABLE analytics.user_behavior AS
SELECT
user_id,
DATE(timestamp) AS event_date,
event_type,
COUNT(*) AS event_count,
COUNTIF(event_type = 'purchase') AS purchase_count,
SUM(CASE WHEN event_type = 'purchase' THEN revenue ELSE 0 END) AS total_revenue,
-- ์ธ์ ๋ถ์ (๋ณต์กํ ์๋์ฐ ํจ์)
ARRAY_AGG(
STRUCT(timestamp, event_type, page_url)
ORDER BY timestamp
) AS event_sequence
FROM raw.events
WHERE DATE(timestamp) = CURRENT_DATE() - 1
GROUP BY user_id, DATE(timestamp), event_type;
6. ๋๊ตฌ ์ ํ ๊ฐ์ด๋
6.1 ๋ฐ์ดํฐ ๊ท๋ชจ๋ณ ๊ถ์ฅ ๋๊ตฌ
| ๋ฐ์ดํฐ ๊ท๋ชจ | ETL ๋๊ตฌ | ELT ๋๊ตฌ |
|-------------|---------|---------|
| ์๊ท๋ชจ (< 1GB) | Python + Pandas | dbt + PostgreSQL |
| ์ค๊ท๋ชจ (1GB-100GB) | Airflow + Python | dbt + Snowflake |
| ๋๊ท๋ชจ (> 100GB) | Spark | dbt + BigQuery/Databricks |
6.2 ์ํคํ์ฒ๋ณ ๊ถ์ฅ
# Recommended approach and tooling per target architecture. The "reason"
# values are reader-facing Korean text and are kept verbatim.
architecture_recommendations = {
    "traditional_dw": {
        "approach": "ETL",
        "tools": ["Informatica", "Talend", "SSIS"],
        "reason": "์คํค๋ง ์๊ฒฉ, ๋ณํ ํ ์ ์ฌ"
    },
    "cloud_dw": {
        "approach": "ELT",
        "tools": ["dbt", "Fivetran + dbt", "Airbyte + dbt"],
        # Bug fix: this literal was split across two lines by a lost
        # control character, which made the module unparsable; rejoined.
        "reason": "DW ์ปดํจํ ํ์ ํ์ฉ, ์๋ณธ ๋ณด์กด"
    },
    "data_lake": {
        "approach": "ELT",
        "tools": ["Spark", "AWS Glue", "Databricks"],
        "reason": "์คํค๋ง ์ ์ฐ, ๋์ฉ๋ ์ฒ๋ฆฌ"
    },
    "hybrid": {
        "approach": "ETLT",
        "tools": ["Airflow + dbt", "Prefect + dbt"],
        "reason": "๋ฏผ๊ฐ ์ ๋ณด ์ฒ๋ฆฌ + DW ๋ณํ"
    }
}
์ฐ์ต ๋ฌธ์
๋ฌธ์ 1: ETL vs ELT ์ ํ
๋ค์ ์ํฉ์์ ETL๊ณผ ELT ์ค ์ด๋ค ๋ฐฉ์์ด ์ ํฉํ์ง ์ ํํ๊ณ ์ด์ ๋ฅผ ์ค๋ชํ์ธ์:
- ์ผ์ผ 100GB์ ๋ก๊ทธ ๋ฐ์ดํฐ๋ฅผ BigQuery์ ์ ์ฌ
- ๊ฐ์ธ์ ๋ณด๊ฐ ํฌํจ๋ ๊ณ ๊ฐ ๋ฐ์ดํฐ๋ฅผ ์ฒ๋ฆฌ

๋ฌธ์ 2: ELT SQL ์์ฑ
Raw ํ์ด๋ธ raw_orders์์ ์ผ๋ณ ๋งค์ถ ์ง๊ณ ํ์ด๋ธ์ ์์ฑํ๋ ELT SQL์ ์์ฑํ์ธ์.
์์ฝ
| ๊ฐ๋ | ์ค๋ช |
|------|------|
| ETL | ๋ณํ ํ ์ ์ฌ, ์ค๊ฐ ์๋ฒ์์ ์ฒ๋ฆฌ |
| ELT | ์ ์ฌ ํ ๋ณํ, ํ๊ฒ ์์คํ์์ ์ฒ๋ฆฌ |
| ETL ์ฅ์ | ๋ฐ์ดํฐ ํ์ง ๋ณด์ฅ, ๋ฏผ๊ฐ ์ ๋ณด ์ฒ๋ฆฌ |
| ELT ์ฅ์ | ๋น ๋ฅธ ์ ์ฌ, ์ ์ฐํ ์คํค๋ง, ์๋ณธ ๋ณด์กด |
| ํ์ด๋ธ๋ฆฌ๋ | ETL + ELT ์กฐํฉ, ์ํฉ์ ๋ง๊ฒ ์ ํ |
์ฐธ๊ณ ์๋ฃ