SQL to Python Pandas
Filtering
Filter on one column
select
*
from
df
where
field = value
df_result = df[df['field'] == value]
Filter on two columns with an and
select
*
from
df
where
field1 = value1 and
field2 = value2
df_result = df[(df['field1'] == value2) & (df['field2'] == value2)]
Filter on a list
SELECT
*
FROM
df
WHERE
field1 IN ('value1', 'value2')
AND
field2 IN ('value3', 'value4');
df_result = df[(df['field1'].isin(['value1', 'value2'])) & (df['field2'].isin(['value3', 'value4']))]
Filter on the latest date
select
*
from
df
where
datestamp = (select max(datestamp) from df)
df_result = df[df['datestamp'] == df['datestamp'].max()]
Filter on the latest date grouped by dimension
SELECT
MAX(datestamp),
dimension
FROM
df
GROUP BY
metric_id;
df_latest = df.merge(
df.groupby('dimension', as_index=False).agg({'datestamp': 'max'}),
on=['dimension', 'datestamp'],
how='inner'
)
Joining
SELECT
*
FROM
df1
LEFT JOIN df2 ON
df2.id2 = df1.id1
df_result = pd.merge(
df1,
df2,
left_on="id1",
right_on="id2",
how="left"
)
Retrieve a single distinct value
SELECT DISTINCT
field
FROM
df
df_result = df['field'].unique()
Retrieve multiple distinct values
SELECT DISTINCT
field1,
field2
FROM
df
distinct_df = df[['field1', 'field2']].drop_duplicates()
Summarising
select
id,
datestamp,
avg(score),
avg(slo)
from
df
group by
id,
datestamp
df_result = df.groupby(['id','datestamp'],as_index=True).agg({
'score' : 'mean',
'slo' : 'mean'
}).reset_index()
df_result = ['id','datestamp', 'score','slo']
Calculated fields
select
total_ok,
total,
total_ok / total as score
from
df
df['score'] = df.apply(lambda row: row['total_ok'] / row['total'], axis=1)