3 min read

Offline Campaign Analysis Measurement: A journey through causal impact, geo-experimentation and synthetic control

In October 2022 I had the opportunity to give a talk at the Helsinki Data Science Meetup hosted by Wolt. Here I want to share the recording of my talk.

Abstract: The talk shows how to measure the impact of offline campaigns using causal inference techniques. In particular, it focuses on tapping into the potential of synthetic control, geo-experiments via time-based regression, and Google’s CausalImpact method.


Code to generate data

You can find the raw data here and the code here.

import logging
from datetime import datetime

import numpy as np
import numpy.typing as npt
import pandas as pd
from rich.logging import RichHandler
from scipy.special import expit


class ZipCodeDataGenerator:
    """Class to generate data for a simulated geo-experiment of orders by zipcode.

    Orders are modeled as a binomial distribution over the zipcode population.
    The order rate is a function of a trend, seasonal components, the zipcode strength
    (unobserved variable) and the campaign effect for the treatment group.

    All randomness is drawn from the ``rng`` passed to the constructor, so two
    generators built with identically seeded generators produce the same data.
    """

    def __init__(
        self,
        n_zipcodes: int,
        start_date: datetime,
        end_date: datetime,
        freq: str,
        start_campaign: datetime,
        rng: np.random.Generator,
    ) -> None:
        """Store the simulation configuration.

        Args:
            n_zipcodes: Number of zipcodes to simulate (ids ``0 .. n_zipcodes - 1``).
            start_date: First date of the simulated period.
            end_date: Last date of the simulated period.
            freq: Pandas frequency string forwarded to ``pd.date_range``.
            start_campaign: Dates on or after this value are flagged as campaign.
            rng: Random generator used for populations and order counts.
        """
        self.n_zipcodes = n_zipcodes
        self.start_date = start_date
        self.end_date = end_date
        self.freq = freq
        self.start_campaign = start_campaign
        self.rng = rng

    def generate_zipcodes_ids(self) -> npt.NDArray[np.int_]:
        """Return the integer zipcode ids ``0, 1, ..., n_zipcodes - 1``."""
        return np.arange(start=0, stop=self.n_zipcodes, step=1)

    def generate_dates(self) -> pd.DatetimeIndex:
        """Return the dates from ``start_date`` to ``end_date`` at ``freq``."""
        return pd.date_range(start=self.start_date, end=self.end_date, freq=self.freq)

    def generate_base_df(
        self, zipcodes: npt.ArrayLike, date_range: npt.ArrayLike
    ) -> pd.DataFrame:
        """Build one row per (date, zipcode) pair and flag the campaign period.

        The cross join keeps dates as the outer (slowest varying) key.
        """
        data_df: pd.DataFrame = pd.merge(
            left=pd.DataFrame(data={"date": date_range}),
            right=pd.DataFrame(data={"zipcode": zipcodes}),
            how="cross",
        )
        data_df["is_campaign"] = data_df["date"] >= self.start_campaign
        return data_df

    def add_seasonality_features(self, data_df: pd.DataFrame) -> pd.DataFrame:
        """Add weekend/weekday flags and a normalized trend, in place.

        ``norm_trend`` is evenly spaced from 0 to 1 over the rows of ``data_df``
        in their current order (not per distinct date).
        """
        # pandas weekday: Monday=0 ... Sunday=6, so > 4 selects Saturday/Sunday.
        data_df["is_weekend"] = data_df["date"].dt.weekday > 4
        data_df["is_weekday"] = ~data_df["is_weekend"]
        data_df["norm_trend"] = np.linspace(start=0, stop=1, num=data_df.shape[0])
        return data_df

    def generate_zipcodes_population(
        self, data_df: pd.DataFrame, zipcodes: npt.ArrayLike
    ) -> pd.DataFrame:
        """Draw one population per zipcode and add it as ``population``, in place.

        Each population is ``int(10_000 * Gamma(shape=(zipcode % 3) + 1, scale=1))``,
        drawn once per zipcode in the order given by ``zipcodes``.
        """
        zipcodes_population_map: dict[int, int] = {
            zipcode: int(10_000 * self.rng.gamma(shape=(zipcode % 3) + 1, scale=1))
            for zipcode in zipcodes
        }
        data_df["population"] = data_df["zipcode"].map(zipcodes_population_map)
        return data_df

    def generate_strength_feature(
        self, data_df: pd.DataFrame, zipcodes: npt.ArrayLike
    ) -> pd.DataFrame:
        """Add the binary ``strength`` column, in place.

        Zipcodes with id below ``n_zipcodes // 2`` get strength 1, the rest 0.
        """
        threshold = self.n_zipcodes // 2
        # int(...) works for both NumPy and plain Python integers, unlike
        # calling .astype on the comparison result.
        zipcodes_strength_map: dict[int, int] = {
            zipcode: int(zipcode < threshold) for zipcode in zipcodes
        }
        data_df["strength"] = data_df["zipcode"].map(zipcodes_strength_map)
        return data_df

    def generate_variant_tag(
        self, data_df: pd.DataFrame, zipcodes: npt.ArrayLike
    ) -> pd.DataFrame:
        """Add ``variant`` and ``is_campaign_treatment`` columns, in place.

        Zipcodes with id below ``n_zipcodes // 3`` are tagged ``"treatment"``,
        the rest ``"control"``. ``is_campaign_treatment`` is True only for
        treatment rows inside the campaign period (requires ``is_campaign``).
        """
        threshold = self.n_zipcodes // 3
        zipcode_variant_map: dict[int, int] = {
            zipcode: int(zipcode < threshold) for zipcode in zipcodes
        }
        data_df["variant"] = (
            data_df["zipcode"]
            .map(zipcode_variant_map)
            .map({0: "control", 1: "treatment"})
        )
        mask = data_df["is_campaign"] & (data_df["variant"] == "treatment")
        data_df["is_campaign_treatment"] = mask
        return data_df

    def generate_order_rate(self, data_df: pd.DataFrame) -> pd.DataFrame:
        """Add the true order rate (with and without treatment), in place.

        The rate is built on the logit scale from the strength-dependent base
        level, the treatment effect, a weekday penalty, the trend and a constant
        offset, then mapped to a probability with ``expit``.
        """
        base_or: dict[int, float] = {
            0: 0.4,
            1: 0.6,
        }  # base conversion rate depends on the strength level
        treatment_effect: float = 7e-2

        data_df["order_rate_true_logit"] = (
            data_df["strength"].map(base_or)
            + (data_df["is_campaign_treatment"] * treatment_effect)
            - 5e-2 * data_df["is_weekday"]
            + 6e-2 * data_df["norm_trend"]
            - 2.3
        )

        # Counterfactual: same logit with the treatment contribution removed.
        data_df["order_rate_true_logit_no_treatment"] = data_df[
            "order_rate_true_logit"
        ] - (data_df["is_campaign_treatment"] * treatment_effect)

        data_df["order_rate_true"] = expit(data_df["order_rate_true_logit"])
        data_df["order_rate_true_no_treatment"] = expit(
            data_df["order_rate_true_logit_no_treatment"]
        )
        return data_df

    def generate_orders(self, data_df: pd.DataFrame) -> pd.DataFrame:
        """Sample observed and counterfactual orders and derived rates, in place.

        Orders are binomial draws with ``n=population`` and ``p`` equal to the
        true order rate. Draws come from ``self.rng`` (not a module-level
        generator), so the class works when imported and honors the injected
        generator. Observed orders are drawn before counterfactual ones.
        """
        data_df["orders"] = self.rng.binomial(
            n=data_df["population"], p=data_df["order_rate_true"]
        )
        data_df["orders_no_treatment"] = self.rng.binomial(
            n=data_df["population"], p=data_df["order_rate_true_no_treatment"]
        )
        data_df["order_rate"] = data_df["orders"] / data_df["population"]
        data_df["order_rate_no_treatment"] = (
            data_df["orders_no_treatment"] / data_df["population"]
        )
        data_df["expected_orders"] = data_df["population"] * data_df["order_rate_true"]
        return data_df

    def run(self) -> pd.DataFrame:
        """Run the full pipeline and return the simulated data frame.

        The steps are order-dependent: later steps read columns created by
        earlier ones.
        """
        zipcodes = self.generate_zipcodes_ids()
        date_range = self.generate_dates()
        data_df = self.generate_base_df(zipcodes=zipcodes, date_range=date_range)
        data_df = self.add_seasonality_features(data_df=data_df)
        data_df = self.generate_zipcodes_population(data_df=data_df, zipcodes=zipcodes)
        data_df = self.generate_strength_feature(data_df=data_df, zipcodes=zipcodes)
        data_df = self.generate_variant_tag(data_df=data_df, zipcodes=zipcodes)
        data_df = self.generate_order_rate(data_df=data_df)
        data_df = self.generate_orders(data_df=data_df)
        return data_df


if __name__ == "__main__":
    # Script entry point: simulate the geo-experiment data set and write it to CSV.
    from pathlib import Path

    FORMAT: str = "%(message)s"
    logging.basicConfig(
        level="INFO", format=FORMAT, datefmt="[%X]", handlers=[RichHandler()]
    )

    logging.info("Generating data...")
    # Deterministic seed derived from the character codes of "wolt".
    seed: int = sum(map(ord, "wolt"))
    rng: np.random.Generator = np.random.default_rng(seed=seed)
    n_zipcodes: int = 100
    start_date: datetime = datetime(year=2022, month=4, day=1)
    end_date: datetime = datetime(year=2022, month=7, day=31)
    freq: str = "D"
    start_campaign: datetime = datetime(year=2022, month=7, day=1)

    zipcode_data_generator = ZipCodeDataGenerator(
        n_zipcodes=n_zipcodes,
        start_date=start_date,
        end_date=end_date,
        freq=freq,
        start_campaign=start_campaign,
        rng=rng,
    )

    data_df: pd.DataFrame = zipcode_data_generator.run()
    logging.info("Data generation complete!")

    # Single source of truth for the output location, so the log message and
    # the written file cannot drift apart.
    output_path: str = "data/zipcodes_data.csv"
    # Create the target directory up front; otherwise to_csv raises after all
    # the data has already been generated.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    data_df.to_csv(output_path, index=False)
    logging.info("Data saved to %s", output_path)