Applying machine learning models within the Lean Six Sigma methodology to optimize partner revenue velocity and reduce process waste.
Published on August 16, 2025 by Vimal Octavius PJ
lean six sigma process improvement partner revenue data analysis
7 min READ
Problem: Partner revenue velocity inconsistency
Current State Metrics:
Root Causes:
Solutions:
Control Plan:
This Lean Six Sigma analysis examines partner revenue velocity using 100 synthetic monthly observations spanning January 2022 to April 2030. The study identifies key process inefficiencies and proposes targeted improvements.
Partner revenue velocity shows significant variation (0.12-0.44) with extended cycle times averaging 15.16 days, indicating process waste and inefficiency in the partner revenue generation system.
# ========================================================
# ML Model to Predict Partner Delivered Revenue (Synthetic Data)
# ========================================================
# Install only the packages that are not already available, so re-running the
# script does not reinstall everything each time. tidyverse is included because
# the script loads it below.
required_packages <- c(
  "tidyverse", "corrplot", "caret", "gridExtra", "scales", "randomForest"
)
# system.file() returns "" for a package that is not installed.
is_installed <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Load necessary libraries
library(tidyverse)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(randomForest)
# ========================================================
# 1. Generate Synthetic Data
# ========================================================
# Fixed seed so the synthetic data set is reproducible.
set.seed(42)
n_samples <- 100

# Generate synthetic data with realistic relationships.
# The rnorm() calls are kept in their original order (six predictors, then the
# revenue noise term) so the random draws themselves are unchanged.
data <- tibble(
  Month = seq(as.Date("2022-01-01"), by = "month", length.out = n_samples),
  woCreate2CompleteDays = round(rnorm(n_samples, mean = 15, sd = 5)),
  woaCreate2CompleteDays = round(rnorm(n_samples, mean = 12, sd = 4)),
  PartnerBillableBooking = round(rnorm(n_samples, mean = 500000, sd = 150000)),
  PartnerPipelineVelocity = round(rnorm(n_samples, mean = 0.25, sd = 0.08), 2),
  POapprovalCycleTime = round(rnorm(n_samples, mean = 8, sd = 3)),
  PartnerOppCount = round(rnorm(n_samples, mean = 25, sd = 8))
) %>%
  # Floor the predictors BEFORE deriving revenue. Previously revenue was
  # computed from the raw draws and the predictors were floored afterwards, so
  # any floored row carried a revenue inconsistent with its stored predictors.
  mutate(
    woCreate2CompleteDays = pmax(woCreate2CompleteDays, 1),
    woaCreate2CompleteDays = pmax(woaCreate2CompleteDays, 1),
    PartnerBillableBooking = pmax(PartnerBillableBooking, 50000),
    PartnerPipelineVelocity = pmax(PartnerPipelineVelocity, 0.05),
    POapprovalCycleTime = pmax(POapprovalCycleTime, 1),
    PartnerOppCount = pmax(PartnerOppCount, 5)
  ) %>%
  # Revenue is a linear function of the predictors plus Gaussian noise:
  # bookings, velocity and opportunity count add to it; the three cycle-time
  # variables subtract from it.
  mutate(
    PartnerDeliveredRevenue = round(
      300000 +
        PartnerBillableBooking * 0.8 +
        PartnerPipelineVelocity * 2000000 +
        PartnerOppCount * 15000 -
        woCreate2CompleteDays * 8000 -
        woaCreate2CompleteDays * 5000 -
        POapprovalCycleTime * 12000 +
        rnorm(n_samples, 0, 100000)
    ),
    # Keep revenue at or above 100000.
    PartnerDeliveredRevenue = pmax(PartnerDeliveredRevenue, 100000)
  )

# Display structure and summary. summary() is printed explicitly so the output
# also appears when the script is run through source().
str(data)
print(summary(data))
# ========================================================
# 2. Exploratory Data Analysis
# ========================================================
# Shared palette; the plots in this section index into it instead of repeating
# hex literals.
my_colors <- c(
  "#2C3E50", "#E74C3C", "#3498DB", "#2ECC71", "#F1C40F", "#9B59B6"
)

# Time series plot of delivered revenue, y axis labelled in millions of dollars.
# `linewidth` replaces the deprecated `size` argument for line geoms.
p1 <- ggplot(data, aes(x = Month, y = PartnerDeliveredRevenue)) +
  geom_line(color = my_colors[1], linewidth = 1) +
  geom_point(color = my_colors[2], size = 2) +
  theme_minimal() +
  labs(
    title = "Partner Delivered Revenue Over Time (Synthetic Data)",
    x = "Month", y = "Revenue"
  ) +
  scale_y_continuous(
    labels = function(x) paste0("$", format(x/1000000, digits = 1), "M")
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
print(p1)

# Variable distributions: reshape every non-date column to long form and draw
# one histogram per variable. pivot_longer() replaces the superseded gather().
p2 <- data %>%
  select(-Month) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram(fill = my_colors[3], color = my_colors[1], bins = 15) +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Variables", x = "", y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
print(p2)

# Correlation matrix of all numeric columns (Month excluded).
correlation_matrix <- cor(data %>% select(-Month))
corrplot(
  correlation_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.cex = 0.8,
  col = colorRampPalette(c(my_colors[2], "#FFFFFF", my_colors[4]))(100),
  title = "Correlation Matrix",
  # Reserve a top margin so the title is not clipped by the plot region.
  mar = c(0, 0, 2, 0)
)
# ========================================================
# 3. Multiple Linear Regression Model
# ========================================================
# Seed the RNG so the train/test partition is reproducible.
set.seed(123)
trainIndex <- createDataPartition(
  data$PartnerDeliveredRevenue,
  p = 0.8,
  list = FALSE
)
# With list = FALSE the partition comes back as a one-column matrix. Indexing a
# tibble with a matrix is deprecated, so use the column as a plain vector.
train_rows <- trainIndex[, 1]
train_data <- data[train_rows, ]
test_data <- data[-train_rows, ]

# Build model: delivered revenue regressed on all six predictors.
lm_model <- lm(PartnerDeliveredRevenue ~ woCreate2CompleteDays + woaCreate2CompleteDays +
  PartnerBillableBooking + PartnerPipelineVelocity + POapprovalCycleTime +
  PartnerOppCount, data = train_data)
# Printed explicitly so the summary also appears under source().
print(summary(lm_model))

# Test predictions on the held-out rows.
predictions <- predict(lm_model, test_data)
# RMSE on the held-out rows.
test_rmse <- sqrt(mean((test_data$PartnerDeliveredRevenue - predictions)^2))
# R-squared here is the squared Pearson correlation between observed and
# predicted values.
test_r2 <- cor(test_data$PartnerDeliveredRevenue, predictions)^2
cat("Test RMSE:", test_rmse, "\n")
cat("Test R-squared:", test_r2, "\n")
# ========================================================
# 4. Random Forest Model
# ========================================================
# Fit a 500-tree random forest on the same predictors as the linear model.
# NOTE: no seed is set in this section; reproducibility relies on the
# set.seed(123) call in section 3 and on the sections running in order.
rf_model <- randomForest(
  PartnerDeliveredRevenue ~ woCreate2CompleteDays + woaCreate2CompleteDays +
    PartnerBillableBooking + PartnerPipelineVelocity +
    POapprovalCycleTime + PartnerOppCount,
  data = train_data,
  ntree = 500,
  importance = TRUE
)
print(rf_model)

# Variable importance. The table is printed explicitly so it also appears when
# the script is run through source(), where top-level values are not
# auto-printed.
print(importance(rf_model))
varImpPlot(rf_model)

# Random Forest predictions on the held-out rows, scored with the same two
# metrics as the linear model (RMSE and squared correlation).
rf_predictions <- predict(rf_model, test_data)
rf_rmse <- sqrt(mean((test_data$PartnerDeliveredRevenue - rf_predictions)^2))
rf_r2 <- cor(test_data$PartnerDeliveredRevenue, rf_predictions)^2
cat("Random Forest Test RMSE:", rf_rmse, "\n")
cat("Random Forest Test R-squared:", rf_r2, "\n")
# ========================================================
# 5. Model Comparison
# ========================================================
# Held-out actuals side by side with each model's predictions.
comparison_df <- data.frame(
  Actual = test_data$PartnerDeliveredRevenue,
  Linear_Model = predictions,
  Random_Forest = rf_predictions
)

# Axis labeller: dollar amounts rescaled to millions with an "M" suffix.
format_millions <- function(x) {
  paste0("$", format(x/1000000, digits = 1), "M")
}

# Predicted-vs-actual scatter with a dashed y = x reference line.
# `mapping` supplies the aesthetics, so each call chooses its own y column.
plot_predicted_vs_actual <- function(mapping, point_color, plot_title) {
  ggplot(comparison_df, mapping) +
    geom_point(color = point_color, size = 2) +
    geom_abline(
      intercept = 0, slope = 1,
      color = "#E74C3C", linetype = "dashed"
    ) +
    theme_minimal() +
    labs(
      title = plot_title,
      x = "Actual Revenue", y = "Predicted Revenue"
    ) +
    scale_x_continuous(labels = format_millions) +
    scale_y_continuous(labels = format_millions)
}

# One panel per model.
p_lm <- plot_predicted_vs_actual(
  aes(x = Actual, y = Linear_Model),
  "#3498DB",
  "Linear Model: Predicted vs Actual"
)
p_rf <- plot_predicted_vs_actual(
  aes(x = Actual, y = Random_Forest),
  "#2ECC71",
  "Random Forest: Predicted vs Actual"
)

# Show both panels next to each other.
grid.arrange(p_lm, p_rf, ncol = 2)