Soda Intake Analysis¶
Introduction¶
I have been an avid soda drinker for many years (team Coca-Cola over Pepsi). However, as my metabolism has slowed over time, I have been trying to be more cognizant of the amount I drink (long gone are the college days when I could down an entire 2-liter bottle). This notebook has been put together to help me analyze and understand my soda intake habits. I have up-to-date data hosted in a local database on my desktop, but I have also included a CSV snapshot for reproducibility that I will update from time to time.
1. Setup¶
Settings¶
USE_DB = True
Imports¶
# Basic Imports
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
# Database Imports
from sqlalchemy import create_engine
import psycopg2
# Plotting Imports
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import warnings
# Stats/ML packages
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
from sktime.forecasting.croston import Croston
from sktime.forecasting.statsforecast import StatsForecastADIDA
warnings.filterwarnings('ignore')
# Local Imports
from fns import get_table
Data Connection¶
if USE_DB:
# Load Settings
load_dotenv()
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PW = os.getenv("POSTGRES_PW")
engine = create_engine(f'postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PW}@localhost:5432/coke')
print("Successful import")
Successful import
2. Data¶
Load¶
if USE_DB:
# Retrieve the data from the database
df = get_table("consumption", engine)
else:
# Retrieve from csv
df = pd.read_csv("intake.csv")
Adjustments¶
# Order by date, add the day of week and day-over-day diff, and display the first week
df = df.sort_values(by='date')
df["day"] = pd.to_datetime(df["date"]).dt.day_name()
df["diff"] = df["ounces"].diff()
df.head(7)
| | date | ounces | id | day | diff |
|---|---|---|---|---|---|
| 6 | 2025-06-23 | 0.0 | 1 | Monday | NaN |
| 0 | 2025-06-24 | 40.0 | 2 | Tuesday | 40.0 |
| 7 | 2025-06-25 | 0.0 | 3 | Wednesday | -40.0 |
| 1 | 2025-06-26 | 30.0 | 4 | Thursday | 30.0 |
| 2 | 2025-06-27 | 20.0 | 5 | Friday | -10.0 |
| 3 | 2025-06-28 | 32.0 | 6 | Saturday | 12.0 |
| 4 | 2025-06-29 | 12.0 | 7 | Sunday | -20.0 |
# Add 1-week and 1-month moving averages to the dataset
df['SMA_7'] = df['ounces'].rolling(window=7).mean()
df['SMA_30'] = df['ounces'].rolling(window=30).mean()
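As a side note, `rolling(window=7)` leaves the first six rows of `SMA_7` as NaN. A minimal sketch with made-up numbers, showing how `min_periods` (a standard pandas rolling parameter) shortens that warm-up if desired:

```python
import pandas as pd

# Hypothetical stand-in for the ounces column
s = pd.Series([0.0, 40.0, 0.0, 30.0, 20.0, 32.0, 12.0])

# Default: the first window-1 values are NaN
sma_strict = s.rolling(window=7).mean()

# min_periods=1 yields a partial average during the warm-up
sma_partial = s.rolling(window=7, min_periods=1).mean()

print(sma_strict.isna().sum())                                  # 6 leading NaNs
print(round(sma_partial.iloc[0], 1), round(sma_partial.iloc[-1], 1))  # 0.0 19.1
```

Whether the partial averages are desirable depends on the plot; here the NaN warm-up is harmless.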
3. Exploratory Analysis¶
# View the raw data over time
fig_line = px.line(df, x="date", y="ounces")
fig_line.show()
Note: There are a large number of 0-ounce days
# Create plotly figure
fig = go.Figure()
# Add traces
fig.add_trace(go.Scatter(x=df['date'], y=df['ounces'], mode='lines', name='Original Value'))
fig.add_trace(go.Scatter(x=df['date'], y=df['SMA_7'], mode='lines', name='7-Period SMA'))
fig.add_trace(go.Scatter(x=df['date'], y=df['SMA_30'], mode='lines', name='30-Period SMA'))
# Update layout for better readability
fig.update_layout(
title='Original Value vs. 7-Period and 30-Period Simple Moving Average',
xaxis_title='Date',
yaxis_title='Ounces'
)
# Show the plot
fig.show()
There is definitely some periodicity visible in the 7-day moving average, but it gets smoothed out in the 30-day moving average.
# Group the data by day of week and view the averages by day
df_day_group = df.groupby("day")[["ounces"]].mean()
df_day_group = df_day_group.loc[["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], :]
fig_scatter2 = px.scatter(df_day_group, x=df_day_group.index, y="ounces")
fig_scatter2.show()
fig = px.box(
df,
x="day",
y="ounces"
)
fig.show()
There seems to be a weak trend of both mean and variability increasing through the week (with the exception of Tuesdays)!
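The `.loc` reindexing above works, but it assumes every day of the week appears in the data. An alternative sketch, using a hypothetical stand-in frame, is to make `day` an ordered Categorical so the groupby preserves Monday-through-Sunday order on its own:

```python
import pandas as pd

days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Made-up stand-in for the real frame
demo = pd.DataFrame({
    "day": ["Sunday", "Monday", "Monday", "Friday"],
    "ounces": [12.0, 0.0, 40.0, 20.0],
})

# An ordered Categorical keeps Monday-Sunday order through the groupby
demo["day"] = pd.Categorical(demo["day"], categories=days, ordered=True)
means = demo.groupby("day", observed=False)["ounces"].mean()
print(means)  # indexed Monday..Sunday; days absent from the data show NaN
```

With `observed=False`, missing days appear as NaN rather than being dropped, which also makes gaps in the data visible.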
Transformations¶
Diff-ing¶
# View the data over time
fig_line = px.line(df, x="date", y="diff")
fig_line.show()
Diff-ing definitely produces a more stationary-appearing series, which will likely be more appropriate for an ARIMA model than the raw ounces data.
Resampling (Sparsity)¶
dates = pd.date_range(start="2025-06-23", freq="D", periods=len(df['date']))
y_sparse = pd.Series(df["ounces"])
y_sparse.index = dates
y_sparse
2025-06-23 0.0
2025-06-24 40.0
2025-06-25 0.0
2025-06-26 30.0
2025-06-27 20.0
...
2026-03-23 0.0
2026-03-24 25.0
2026-03-25 40.0
2026-03-26 0.0
2026-03-27 12.0
Freq: D, Name: ounces, Length: 278, dtype: float64
Weekly¶
y_week = y_sparse.resample("W").sum()
plt.figure(figsize=(12, 6))
plt.plot(y_week, label='Actual Intake')
plt.title('Weekly Soda Intake')
plt.xlabel('Date')
plt.ylabel('Ounces')
plt.legend()
plt.show()
This drastically reduces the number of 0-ounce entries. It may leave too few datapoints for meaningful analysis, but it is worth trying.
Monthly¶
y_month = y_sparse.resample("ME").sum()
plt.figure(figsize=(12, 6))
plt.plot(y_month, label='Actual Intake')
plt.title('Monthly Soda Intake')
plt.xlabel('Date')
plt.ylabel('Ounces')
plt.legend()
plt.show()
Not enough datapoints yet for meaningful analysis on monthly aggregation!
4. Predictive Analysis¶
Two-Sample Testing¶
One of the hypotheses I wish to test in this project is the following:
I traveled to Europe for the holidays (YAY!), where I didn't drink as much soda. I suspect that since I have returned stateside, my daily soda intake has continued to stay lower than before the trip. Normally, I would conduct a two-sample t-test, but because of the nature of this dataset, there is likely a high level of autocorrelation, so that test would not be appropriate. Therefore, I am going to run a time series regression with Newey–West errors to correct for the autocorrelation.
y_2samp = df[["date", "ounces"]].copy()
y_2samp["post_trip"] = (pd.to_datetime(y_2samp["date"]) > pd.to_datetime("2025-11-30")).astype(int)
y_2samp
| | date | ounces | post_trip |
|---|---|---|---|
| 6 | 2025-06-23 | 0.0 | 0 |
| 0 | 2025-06-24 | 40.0 | 0 |
| 7 | 2025-06-25 | 0.0 | 0 |
| 1 | 2025-06-26 | 30.0 | 0 |
| 2 | 2025-06-27 | 20.0 | 0 |
| ... | ... | ... | ... |
| 273 | 2026-03-23 | 0.0 | 1 |
| 274 | 2026-03-24 | 25.0 | 1 |
| 275 | 2026-03-25 | 40.0 | 1 |
| 276 | 2026-03-26 | 0.0 | 1 |
| 277 | 2026-03-27 | 12.0 | 1 |
278 rows × 3 columns
# Predictor matrix
X = sm.add_constant(y_2samp["post_trip"])
# Outcome
y = y_2samp["ounces"]
# Fit OLS
model = sm.OLS(y, X)
# Newey-West HAC standard errors
# lag ≈ autocorrelation window (commonly 7 for daily data)
results = model.fit(
cov_type="HAC",
cov_kwds={"maxlags": 7}
)
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: ounces R-squared: 0.006
Model: OLS Adj. R-squared: 0.003
Method: Least Squares F-statistic: 1.873
Date: Sat, 28 Mar 2026 Prob (F-statistic): 0.172
Time: 01:20:13 Log-Likelihood: -1090.3
No. Observations: 278 AIC: 2185.
Df Residuals: 276 BIC: 2192.
Df Model: 1
Covariance Type: HAC
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 10.6894 1.035 10.330 0.000 8.661 12.718
post_trip -1.9629 1.434 -1.368 0.171 -4.774 0.848
==============================================================================
Omnibus: 34.950 Durbin-Watson: 2.143
Prob(Omnibus): 0.000 Jarque-Bera (JB): 45.795
Skew: 0.994 Prob(JB): 1.14e-10
Kurtosis: 2.984 Cond. No. 2.47
==============================================================================
Notes:
[1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 7 lags and without small sample correction
While we do see a nominal decrease in ounces consumed after the vacation, the p-value of 0.171 means the decrease was not statistically significant post-vacay.
Forecasting¶
ARIMA¶
Preprocessing¶
y_arima = pd.Series(df["ounces"])
y_arima.index = dates
Model¶
# In ARIMA(p,d,q), setting p=0 and d=0 makes it a pure MA(q) model.
# Using d=1 applies the one level of diff-ing that we previously noted is appropriate for stationarity
ma_model = ARIMA(y_arima, order=(0, 1, 2))
ma_results = ma_model.fit()
# Print the model summary for insight into parameters and fit
print(ma_results.summary())
SARIMAX Results
==============================================================================
Dep. Variable: ounces No. Observations: 278
Model: ARIMA(0, 1, 2) Log Likelihood -1088.903
Date: Sat, 28 Mar 2026 AIC 2183.806
Time: 01:20:13 BIC 2194.678
Sample: 06-23-2025 HQIC 2188.168
- 03-27-2026
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ma.L1 -1.0645 0.065 -16.320 0.000 -1.192 -0.937
ma.L2 0.1114 0.064 1.731 0.084 -0.015 0.238
sigma2 150.6789 12.934 11.650 0.000 125.330 176.028
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 37.91
Prob(Q): 0.93 Prob(JB): 0.00
Heteroskedasticity (H): 1.06 Skew: 0.90
Prob(H) (two-sided): 0.78 Kurtosis: 3.10
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Forecasting¶
forecast_horizon = 12 # Forecast 12 periods into the future
forecast_results = ma_results.forecast(steps=forecast_horizon)
print("\nOut-of-sample forecasts:")
print(forecast_results)
Out-of-sample forecasts:
2026-03-28    10.930635
2026-03-29    10.952071
2026-03-30    10.952071
2026-03-31    10.952071
2026-04-01    10.952071
2026-04-02    10.952071
2026-04-03    10.952071
2026-04-04    10.952071
2026-04-05    10.952071
2026-04-06    10.952071
2026-04-07    10.952071
2026-04-08    10.952071
Freq: D, Name: predicted_mean, dtype: float64
LSTM¶
Preprocessing¶
# Using diffed ounces again, as differencing makes patterns easier for the model to learn
df_lstm = df.dropna(subset=["diff"])[["date", "diff"]]  # the first row's diff is NaN
df_lstm['date'] = pd.to_datetime(df_lstm['date'])
df_lstm.set_index('date', inplace=True)
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(df_lstm)
Model & Predictions¶
def build_lstm(units1, units2, b_size):
window_size = 30
X = []
y = []
target_dates = df_lstm.index[window_size:]
for i in range(window_size, len(scaled_data)):
X.append(scaled_data[i - window_size:i, 0])
y.append(scaled_data[i, 0])
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test, dates_train, dates_test = train_test_split(
X, y, target_dates, test_size=0.2, shuffle=False
)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
model = Sequential()
model.add(LSTM(units=units1, return_sequences=True,
input_shape=(X_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=units2))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(
X_train,
y_train,
epochs=100,
batch_size=b_size,
validation_split=0.1,
verbose=0
)
predictions = model.predict(X_test)
predictions = scaler.inverse_transform(predictions).flatten()
y_test = scaler.inverse_transform(y_test.reshape(-1,1)).flatten()
rmse = np.sqrt(np.mean((y_test - predictions)**2))
print(f'RMSE: {rmse:.2f}')
plt.figure(figsize=(12, 6))
plt.plot(dates_test, y_test, label='Actual DoD Intake Diff')
plt.plot(dates_test, predictions, label='Predicted DoD Intake Diff')
plt.title('Actual vs Predicted Soda Intake Diff')
plt.xlabel('Date')
plt.ylabel('DoD Diff')
plt.legend()
plt.show()
build_lstm(32, 32, 32)
WARNING:tensorflow:TensorFlow GPU support is not available on native Windows for TensorFlow >= 2.11. Even if CUDA/cuDNN are installed, GPU will not be used. Please use WSL2 or the TensorFlow-DirectML plugin.
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 186ms/step
RMSE: 17.40
The above barely captures any of the variation
build_lstm(64, 64, 32)
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 197ms/step
RMSE: 14.98
The above captures a decent portion of the variation
build_lstm(128, 128, 32)
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 199ms/step
RMSE: 14.86
The above captures a similar portion of the variation as the previous model, but is more computationally expensive and risks overfitting.
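For reference, the sliding-window construction inside `build_lstm` can be checked in isolation. A sketch on a tiny made-up series (`make_windows` is a hypothetical helper mirroring the loop above):

```python
import numpy as np

def make_windows(series: np.ndarray, window_size: int):
    """Build (samples, window) inputs and next-step targets, as in build_lstm."""
    X, y = [], []
    for i in range(window_size, len(series)):
        X.append(series[i - window_size:i])
        y.append(series[i])
    return np.array(X), np.array(y)

demo = np.arange(8, dtype=float)  # made-up stand-in for the scaled diffs
X, y = make_windows(demo, window_size=3)
print(X.shape, y.shape)  # (5, 3) (5,)
print(X[0], y[0])        # [0. 1. 2.] 3.0
```

Each target is simply the value one step past its window, which is why `shuffle=False` in the train/test split matters: shuffling would leak future windows into training.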
Sparse Data¶
Since the raw-ounces time series is zero-heavy, we try some methods that specialize in intermittent data.
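Before picking a method, it helps to quantify just how sparse the series is. A sketch on a made-up stand-in series, computing the zero-day fraction and the average demand interval (ADI); an ADI above roughly 1.32 is the commonly cited Syntetos–Boylan cutoff for intermittent demand:

```python
import numpy as np
import pandas as pd

# Made-up stand-in for y_sparse: many zero days between non-zero intakes
y = pd.Series([0, 40, 0, 0, 30, 0, 20, 0, 0, 0, 32, 12])

zero_frac = (y == 0).mean()
# Average demand interval: mean gap, in periods, between non-zero days
nonzero_pos = np.flatnonzero(y.values)
adi = np.diff(nonzero_pos).mean()
print(f"zero days: {zero_frac:.0%}, ADI: {adi:.2f}")  # zero days: 58%, ADI: 2.50
```

Run on the real `y_sparse`, these two numbers would say whether intermittent-demand methods like Croston and ADIDA are actually warranted here.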
Croston's Method¶
crouston_forecaster = Croston(smoothing=0.1)
y_crouston_pred = []
for i in range(50, len(y_sparse)):
y_sparse_in = y_sparse[:i]
crouston_forecaster.fit(y_sparse_in)
y_crouston_pred.append(crouston_forecaster.predict(fh=[1]))
y_crouston_pred = pd.concat(y_crouston_pred)
y_crouston_pred
2025-08-12 9.261765
2025-08-13 9.508091
2025-08-14 9.508091
2025-08-15 9.508091
2025-08-16 9.508091
...
2026-03-23 9.122200
2026-03-24 9.122200
2026-03-25 9.403918
2026-03-26 10.758802
2026-03-27 10.758802
Freq: D, Name: ounces, Length: 228, dtype: float64
plt.figure(figsize=(12, 6))
plt.plot(y_sparse[50:], label='Actual Intake')
plt.plot(y_crouston_pred, label='Predicted Intake')
plt.title('Actual vs Predicted Soda Intake')
plt.xlabel('Date')
plt.ylabel('Ounces')
plt.legend()
plt.show()
ADIDA¶
adida_model = StatsForecastADIDA()
y_adida_pred = []
for i in range(50, len(y_sparse)):
y_sparse_in = y_sparse[:i]
y_adida_pred.append(adida_model.fit(y_sparse_in).predict(fh=[1]))
y_adida_pred = pd.concat(y_adida_pred)
y_adida_pred
2025-08-12 7.040677
2025-08-13 9.393556
2025-08-14 8.528455
2025-08-15 6.575541
2025-08-16 6.007781
...
2026-03-23 9.623155
2026-03-24 8.969309
2026-03-25 9.985199
2026-03-26 12.391184
2026-03-27 11.264532
Freq: D, Name: ounces, Length: 228, dtype: float64
plt.figure(figsize=(12, 6))
plt.plot(y_sparse[50:], label='Actual Intake')
plt.plot(y_adida_pred, label='Predicted Intake')
plt.title('Actual vs Predicted Soda Intake')
plt.xlabel('Date')
plt.ylabel('Ounces')
plt.legend()
plt.show()
Summary¶
plt.figure(figsize=(12, 6))
plt.plot(y_sparse[50:], label='Actual Intake')
plt.plot(y_crouston_pred, label='Predicted Intake (Croston)')
plt.plot(y_adida_pred, label='Predicted Intake (ADIDA)')
plt.title('Actual vs Predicted Soda Intake')
plt.xlabel('Date')
plt.ylabel('Ounces')
plt.legend()
plt.show()
crouston_rmse = np.sqrt(np.mean((y_sparse[50:] - y_crouston_pred)**2))
round(float(crouston_rmse), 2)
12.26
adida_rmse = np.sqrt(np.mean((y_sparse[50:] - y_adida_pred)**2))
round(float(adida_rmse), 2)
12.48
We considered a variety of analyses for this time series. Ultimately, we like the LSTM model best for forecasting, as it seems to pick up more of the nuances of the series than the alternatives.