Skip to content

SQL to Python Pandas

Filtering

Filter on one column

select
    *
from
    df
where
    field = value
df_result = df[df['field'] == value]

Filter on two columns with an and

select
    *
from
    df
where
    field1 = value1 and 
    field2 = value2
df_result = df[(df['field1'] == value2) & (df['field2'] == value2)]

Filter on a list

SELECT
    *
FROM
    df
WHERE
    field1 IN ('value1', 'value2')
AND
    field2 IN ('value3', 'value4');
df_result = df[(df['field1'].isin(['value1', 'value2'])) & (df['field2'].isin(['value3', 'value4']))]

Filter on the latest date

select
    *
from
    df
where
    datestamp = (select max(datestamp) from df)
df_result = df[df['datestamp'] == df['datestamp'].max()]

Filter on the latest date grouped by dimension

SELECT
    MAX(datestamp),
    dimension
FROM
    df
GROUP BY
    metric_id;
df_latest = df.merge(
    df.groupby('dimension', as_index=False).agg({'datestamp': 'max'}),
    on=['dimension', 'datestamp'],
    how='inner'
)

Joining

SELECT
    *
FROM
    df1
LEFT JOIN df2 ON
    df2.id2 = df1.id1
df_result = pd.merge(
    df1,
    df2,
    left_on="id1",
    right_on="id2",
    how="left"
)

Extracting

Retrieve a single distinct value

SELECT DISTINCT
    field
FROM
    df
df_result = df['field'].unique()

Retrieve multiple distinct values

SELECT DISTINCT
    field1,
    field2
FROM
    df
distinct_df = df[['field1', 'field2']].drop_duplicates()

Summarising

select
    id,
    datestamp,
    avg(score),
    avg(slo)
from
    df
group by
    id,
    datestamp
df_result = df.groupby(['id','datestamp'],as_index=True).agg({
    'score' : 'mean',
    'slo'   : 'mean'
}).reset_index()
df_result = ['id','datestamp', 'score','slo']

Calculated fields

select
    total_ok,
    total,
    total_ok / total as score
from
    df
df['score'] = df.apply(lambda row: row['total_ok'] / row['total'], axis=1)