from IPython.display import display, Image
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import os

os.makedirs('../charts', exist_ok=True)

df = pd.read_csv('../data/screening_results.csv')

print(f'Total companies in pipeline: {len(df)}')
print()
print('Investment flag distribution:')
print(df['investment_flag'].value_counts())
print()
print('Deferred breakdown:')
print(df[df['investment_flag']=='Deferred']['deferred_reason'].value_counts())
print()
print('Submission anomalies:', df['submission_anomaly'].sum())
print('EBITDA reconciliation flags:', df['ebitda_recon_flag'].sum())
print()

# Screenable companies (reached Gate 5)
df_screen   = df[df['investment_flag'].isin(['Advance', 'Decline'])].copy()
df_deferred = df[df['investment_flag'] == 'Deferred'].copy()
df_def_mat  = df[df['deferred_reason'] == 'operating_history'].copy()
df_def_fin  = df[df['deferred_reason'] == 'financial_history'].copy()

print(f'Screened companies: {len(df_screen)}')
print(f'Deferred — operating history: {len(df_def_mat)}')
print(f'Deferred — financial history: {len(df_def_fin)}')
print()
print('Criterion score availability (screened companies):')
for col in ['rev_score', 'ebitda_score', 'growth_score', 'd2e_score']:
    n = df_screen[col].notna().sum()
    print(f'  {col}: {n}/{len(df_screen)}')

Total companies in pipeline: 100

Investment flag distribution:
investment_flag
Decline     63
Deferred    26
Advance     11
Name: count, dtype: int64

Deferred breakdown:
deferred_reason
operating_history    17
financial_history     9
Name: count, dtype: int64

Submission anomalies: 9
EBITDA reconciliation flags: 5

Screened companies: 74
Deferred — operating history: 17
Deferred — financial history: 9

Criterion score availability (screened companies):
  rev_score: 74/74
  ebitda_score: 52/74
  growth_score: 74/74
  d2e_score: 54/74

# ── Screening criteria (reference) ───────────────────────────────────────
# Scores are pre-computed in Phase 2 (extract_and_validate.py) and loaded
# from screening_results.csv. These constants are shown here for reference.

CRITERIA = {
    'revenue_usd_millions': {'advance': 1.5,  'label': 'Revenue',               'unit': '$M',  'higher': True},
    'ebitda_margin_pct':    {'advance': 15.0, 'label': 'EBITDA margin',          'unit': '%',   'higher': True},
    'revenue_growth_pct':   {'advance': 10.0, 'label': 'Revenue growth (YoY)',   'unit': '%',   'higher': True},
    'debt_to_equity':       {'advance': 2.0,  'label': 'Debt-to-equity ratio',   'unit': '',    'higher': False},
}

print('Screening criteria — all four must be met for Advance:')
for key, t in CRITERIA.items():
    direction = '≥' if t['higher'] else '≤'
    unit = t['unit']
    val = f'${t["advance"]}M' if unit == '$M' else f'{t["advance"]}{unit}'
    print(f'  {t["label"]}: {direction} {val}')
print()
print('Investment flag distribution (from Phase 2):')
print(df['investment_flag'].value_counts())

Screening criteria — all four must be met for Advance:
  Revenue: ≥ $1.5M
  EBITDA margin: ≥ 15.0%
  Revenue growth (YoY): ≥ 10.0%
  Debt-to-equity ratio: ≤ 2.0

Investment flag distribution (from Phase 2):
investment_flag
Decline     63
Deferred    26
Advance     11
Name: count, dtype: int64

matplotlib.rcParams['font.family'] = 'monospace'

BG    = '#F7F4EF'
TEXT  = '#1A1A1A'
MID   = '#888780'
BORDER= '#D3D1C7'
BASE  = '#2E3E5A'
ACCENT= '#E8621A'
LIGHT = '#D3D1C7'

n_total   = len(df)
n_def_mat = len(df_def_mat)
n_def_fin = len(df_def_fin)
n_screen  = len(df_screen)
n_advance = int(df['investment_flag'].eq('Advance').sum())
n_decline = int(df['investment_flag'].eq('Decline').sum())

stages = ['Submitted', 'After\nGate 1', 'After\nGate 3', 'Screened', 'Advance', 'Decline']
values = [n_total, n_total - n_def_mat, n_total - n_def_mat - n_def_fin, n_screen, n_advance, n_decline]
colors = [LIGHT, LIGHT, LIGHT, LIGHT, ACCENT, BASE]

fig, ax = plt.subplots(figsize=(12, 5))
fig.patch.set_facecolor(BG); ax.set_facecolor(BG)
ax.xaxis.grid(False)

bars = ax.bar(stages, values, color=colors, width=0.55, zorder=3)
for bar, val in zip(bars, values):
    pct = val / n_total * 100
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.8,
            f'{val}  ({pct:.0f}%)', ha='center', va='bottom',
            fontsize=8, color=TEXT, fontfamily='monospace', fontweight='bold')

ax.set_title('Only 11% of submissions clear all four criteria',
             fontsize=11, fontweight='bold', color=TEXT, fontfamily='monospace', pad=16)
ax.set_ylabel('Number of companies', fontsize=9, color=MID, fontfamily='monospace')
ax.tick_params(colors=MID, labelsize=8)
for spine in ax.spines.values(): spine.set_edgecolor(BORDER)
ax.yaxis.grid(True, color=BORDER, linewidth=0.5, alpha=0.5, zorder=0)
ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)
ax.set_axisbelow(True); ax.set_ylim(0, n_total * 1.22)

plt.tight_layout()
plt.savefig('../charts/chart_1_pipeline_funnel.png', dpi=150, bbox_inches='tight', facecolor=BG)
display(Image('../charts/chart_1_pipeline_funnel.png'))
print('Saved: charts/chart_1_pipeline_funnel.png')

Saved: charts/chart_1_pipeline_funnel.png

criteria_map = {
    'Growth < 10%':         int(df_screen[df_screen['growth_score'].notna()]['growth_score'].eq('decline').sum()),
    'Revenue < $1.5M':      int(df_screen['rev_score'].eq('decline').sum()),
    'EBITDA margin < 15%':  int(df_screen[df_screen['ebitda_score'].notna()]['ebitda_score'].eq('decline').sum()),
    'Debt-to-equity > 2.0': int(df_screen[df_screen['d2e_score'].notna()]['d2e_score'].eq('decline').sum()),
}

pairs   = sorted(zip(criteria_map.values(), criteria_map.keys()), reverse=True)
vals_s, labs_s = zip(*pairs)
pcts    = [v / len(df_screen) * 100 for v in vals_s]
bar_colors = [ACCENT, ACCENT, BASE, BASE]

fig, ax = plt.subplots(figsize=(10, 4.5))
fig.patch.set_facecolor(BG); ax.set_facecolor(BG)
ax.yaxis.grid(False)
ax.xaxis.grid(True, color=BORDER, linewidth=0.5, alpha=0.5, zorder=0)

bars = ax.barh(labs_s, pcts, color=bar_colors, height=0.45, zorder=3)
for bar, val, pct in zip(bars, vals_s, pcts):
    ax.text(pct + 0.5, bar.get_y() + bar.get_height()/2,
            f'{val} companies  ({pct:.1f}%)', ha='left', va='center',
            fontsize=8, color=TEXT, fontfamily='monospace', fontweight='bold')

ax.set_title('Growth and revenue are the two most common barriers to Advance',
             fontsize=11, fontweight='bold', color=TEXT, fontfamily='monospace', pad=16)
ax.set_xlabel('% of screened companies not meeting this criterion', fontsize=9, color=MID, fontfamily='monospace')
ax.tick_params(colors=MID, labelsize=8)
for spine in ax.spines.values(): spine.set_edgecolor(BORDER)
ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)
ax.invert_yaxis(); ax.set_xlim(0, max(pcts) * 1.4)

plt.tight_layout()
plt.savefig('../charts/chart_2_signal_failures.png', dpi=150, bbox_inches='tight', facecolor=BG)
display(Image('../charts/chart_2_signal_failures.png'))
print('Saved: charts/chart_2_signal_failures.png')

Saved: charts/chart_2_signal_failures.png

df_plot = df_screen[df_screen['ebitda_margin_pct'].notna()].copy()

REV_THRESHOLD   = 1.5
EBITDA_THRESHOLD = 15.0

fig, ax = plt.subplots(figsize=(11, 6.5))
fig.patch.set_facecolor(BG); ax.set_facecolor(BG)
ax.xaxis.grid(True, color=BORDER, linewidth=0.5, alpha=0.5, zorder=0)
ax.yaxis.grid(True, color=BORDER, linewidth=0.5, alpha=0.5, zorder=0)

decline_grp = df_plot[df_plot['investment_flag'] == 'Decline']
advance_grp = df_plot[df_plot['investment_flag'] == 'Advance']

ax.scatter(decline_grp['revenue_usd_millions'], decline_grp['ebitda_margin_pct'],
           c=LIGHT, edgecolors=MID, linewidths=0.5, label='Decline', alpha=0.8, s=45, zorder=3)
ax.scatter(advance_grp['revenue_usd_millions'], advance_grp['ebitda_margin_pct'],
           c=ACCENT, edgecolors='none', label='Advance', alpha=0.9, s=65, zorder=4)

ax.axvline(x=REV_THRESHOLD,    color=MID, lw=0.8, ls='--', alpha=0.6, zorder=2)
ax.axhline(y=EBITDA_THRESHOLD, color=MID, lw=0.8, ls='--', alpha=0.6, zorder=2)

ymin = ax.get_ylim()[0]
ax.text(REV_THRESHOLD + 0.1, ymin + 1.5, '$1.5M threshold', fontsize=6.5, color=MID, fontfamily='monospace')
ax.text(0.05, EBITDA_THRESHOLD + 0.5, '15% threshold',    fontsize=6.5, color=MID, fontfamily='monospace')

ax.set_title('Advance companies clear both revenue and margin — the gap is structural',
             fontsize=11, fontweight='bold', color=TEXT, fontfamily='monospace', pad=16)
ax.set_xlabel('Revenue (USD millions)', fontsize=9, color=MID, fontfamily='monospace')
ax.set_ylabel('EBITDA margin (%)',       fontsize=9, color=MID, fontfamily='monospace')
ax.tick_params(colors=MID, labelsize=8)
for spine in ax.spines.values(): spine.set_edgecolor(BORDER)
ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)
ax.set_axisbelow(True)
ax.legend(fontsize=8, framealpha=0, labelcolor=TEXT, loc='upper center', bbox_to_anchor=(0.5, -0.12), ncol=2)

plt.tight_layout()
plt.savefig('../charts/chart_3_ebitda_vs_revenue.png', dpi=150, bbox_inches='tight', facecolor=BG)
display(Image('../charts/chart_3_ebitda_vs_revenue.png'))
print('Saved: charts/chart_3_ebitda_vs_revenue.png')

Saved: charts/chart_3_ebitda_vs_revenue.png

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5.5))
fig.patch.set_facecolor(BG)

# Panel 1 — Outcome distribution
ax1.set_facecolor(BG)
outcome_labels = ['Advance', 'Decline', 'Deferred\n(maturity)', 'Deferred\n(history)']
outcome_counts = [n_advance, n_decline, len(df_def_mat), len(df_def_fin)]
outcome_colors = [ACCENT, BASE, LIGHT, LIGHT]

bars1 = ax1.bar(outcome_labels, outcome_counts, color=outcome_colors, width=0.5, zorder=3)
for bar, val in zip(bars1, outcome_counts):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
             f'{val} ({val / n_total * 100:.0f}%)', ha='center', va='bottom',
             fontsize=8, color=TEXT, fontfamily='monospace', fontweight='bold')
ax1.set_title('63 companies decline — 26 deferred before screening', fontsize=10,
              fontweight='bold', color=TEXT, fontfamily='monospace', pad=12)
ax1.set_ylabel('Number of companies', fontsize=8, color=MID, fontfamily='monospace')
ax1.tick_params(colors=MID, labelsize=8)
for spine in ax1.spines.values(): spine.set_edgecolor(BORDER)
ax1.spines['top'].set_visible(False); ax1.spines['right'].set_visible(False)
ax1.yaxis.grid(True, color=BORDER, linewidth=0.5, alpha=0.5, zorder=0)
ax1.set_axisbelow(True); ax1.set_ylim(0, max(outcome_counts) * 1.3)

# Panel 2 — Completeness score distribution
ax2.set_facecolor(BG)
completeness = df_screen['completeness_score'].dropna()
bins         = [0, 30, 50, 70, 85, 100]
bin_labels   = ['0–30', '31–50', '51–70', '71–85', '86–100']
bin_colors   = [BASE, BASE, BASE, BASE, ACCENT]
counts2, _   = np.histogram(completeness, bins=bins)

bars2 = ax2.bar(bin_labels, counts2, color=bin_colors, width=0.5, zorder=3)
for bar, val in zip(bars2, counts2):
    if val > 0:
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.2,
                 str(val), ha='center', va='bottom',
                 fontsize=9, color=TEXT, fontfamily='monospace', fontweight='bold')
ax2.set_title('Most screened submissions score above 85% completeness', fontsize=10,
              fontweight='bold', color=TEXT, fontfamily='monospace', pad=12)
ax2.set_xlabel('Completeness score', fontsize=8, color=MID, fontfamily='monospace')
ax2.set_ylabel('Number of companies', fontsize=8, color=MID, fontfamily='monospace')
ax2.tick_params(colors=MID, labelsize=8)
for spine in ax2.spines.values(): spine.set_edgecolor(BORDER)
ax2.spines['top'].set_visible(False); ax2.spines['right'].set_visible(False)
ax2.yaxis.grid(True, color=BORDER, linewidth=0.5, alpha=0.5, zorder=0)
ax2.set_axisbelow(True)

plt.tight_layout()
plt.savefig('../charts/chart_4_pipeline_health.png', dpi=150, bbox_inches='tight', facecolor=BG)
display(Image('../charts/chart_4_pipeline_health.png'))
print('Saved: charts/chart_4_pipeline_health.png')

Saved: charts/chart_4_pipeline_health.png

print('Investment flags by sector (screened companies):')
sector_bd = pd.crosstab(df_screen['sector'], df_screen['investment_flag'])
sector_bd['total'] = sector_bd.sum(axis=1)
if 'Advance' in sector_bd.columns:
    sector_bd['advance_rate_%'] = (sector_bd['Advance'] / sector_bd['total'] * 100).round(1)
print(sector_bd.to_string())
print()

print('Investment flags by country (screened companies):')
country_bd = pd.crosstab(df_screen['country'], df_screen['investment_flag'])
country_bd['total'] = country_bd.sum(axis=1)
if 'Advance' in country_bd.columns:
    country_bd['advance_rate_%'] = (country_bd['Advance'] / country_bd['total'] * 100).round(1)
print(country_bd.to_string())

Investment flags by sector (screened companies):
investment_flag  Advance  Decline  total  advance_rate_%
sector                                                  
Agribusiness           2       10     12            16.7
Energy                 2       10     12            16.7
Fintech                0        9      9             0.0
Healthcare             1       12     13             7.7
Logistics              2        6      8            25.0
Manufacturing          2        5      7            28.6
Retail                 2       11     13            15.4

Investment flags by country (screened companies):
investment_flag  Advance  Decline  total  advance_rate_%
country                                                 
Egypt                  1        9     10            10.0
Ethiopia               0        9      9             0.0
Ghana                  1        7      8            12.5
Kenya                  1        5      6            16.7
Nigeria                1        5      6            16.7
Rwanda                 3        4      7            42.9
Senegal                0        6      6             0.0
South Africa           3        6      9            33.3
Tanzania               1        5      6            16.7
Uganda                 0        7      7             0.0

# Final results are already in the CSV from Phase 2.
# This cell prints the summary and confirms the output path.

print('FINAL RESULTS — ALL 100 COMPANIES')
print('=' * 60)
print(df['investment_flag'].value_counts().to_string())
print()
print('Advance companies:')
advance = df[df['investment_flag'] == 'Advance'][[
    'submission_id', 'company_name', 'country', 'sector',
    'revenue_usd_millions', 'ebitda_margin_pct', 'revenue_growth_pct', 'debt_to_equity'
]].sort_values('revenue_usd_millions', ascending=False)
print(advance.to_string(index=False))
print()
print('Submission anomalies (internal — not in founder reports):')
anom = df[df['submission_anomaly'] == True][[
    'submission_id', 'company_name', 'audit_status', 'investment_flag'
]]
print(anom.to_string(index=False))
print()
print('Source: data/screening_results.csv (Phase 2 output)')

FINAL RESULTS — ALL 100 COMPANIES
============================================================
investment_flag
Decline     63
Deferred    26
Advance     11

Advance companies:
submission_id           company_name      country        sector  revenue_usd_millions  ebitda_margin_pct  revenue_growth_pct  debt_to_equity
        AF055     Waterberg Holdings       Rwanda  Agribusiness                13.059              27.44                11.5            1.05
        AF052      Thorn Tree Energy South Africa     Logistics                 4.010              20.61                13.7            1.56
        AF040    Flame Tree Ventures       Rwanda Manufacturing                 3.509              21.62                10.6            1.57
        AF026     River Basin Retail        Egypt        Retail                 3.391              25.39                19.7            1.56
        AF010 Horizon Healthcare Ltd      Nigeria     Logistics                 3.135              18.04                15.8            0.73
        AF036      Boma Retail Chain        Kenya        Energy                 2.718              19.71                13.9            1.07
        AF053         Ubuntu Agri Co South Africa Manufacturing                 2.443              27.43                15.9            1.50
        AF057    Yellow River Retail     Tanzania        Energy                 2.129              15.96                10.3            0.34
        AF050   Prism Capital Africa       Rwanda    Healthcare                 2.102              20.71                18.0            1.38
        AF031       Wangari Agri Ltd South Africa  Agribusiness                 2.021              28.74                12.6            0.92
        AF016       Harambee Capital        Ghana        Retail                 1.955              31.25                11.6            0.68

Submission anomalies (internal — not in founder reports):
submission_id            company_name        audit_status investment_flag
        AF003  Meridian Agri Holdings Management Accounts        Deferred
        AF008  Serengeti Retail Group           Unaudited         Decline
        AF018  Jua Kali Manufacturing           Unaudited         Decline
        AF019    Kilimanjaro Holdings Management Accounts         Decline
        AF020     Lakeshore Financial Management Accounts         Decline
        AF024      Pantheon Logistics Management Accounts         Decline
        AF025 Quartz Energy Solutions Management Accounts         Decline
        AF029  Ubuntu Financial Group           Unaudited         Decline
        AF032   Xcel Logistics Africa Management Accounts         Decline

Source: data/screening_results.csv (Phase 2 output)

Phase	File	Responsibility
1 — Ingestion	`generate_profiles.py`	Generates 100 structured JSON company submission forms
2 — Extraction	`pipeline/extract_and_validate.py`	Runs five-gate validation, derives screening metrics, outputs master CSV
3 — Screening	This notebook	Applies investment thresholds, produces flags and charts

WB Indicator file	What it measures	How it was used
`WB_ES_T_PERF1.csv`	Real annual sales growth (%)	Calibrates growth ranges per archetype. Median: 0.2%, P25: -10.2%, P75: 5.6%
`WB_ES_T_FIN16.csv`	% of firms citing finance as major constraint	Calibrates submission completeness split. Median: 33.1% → 50/33/17 Validated/Partial/Deferred

Gate	Condition	Action	Rationale
Gate 1	`reporting_year − year_founded < 3`	Deferred — exit pipeline	EBITDA from under 3 years of operations is not a reliable signal
Gate 1	Critical fields null (revenue, gross_profit, total_equity)	Deferred — exit pipeline	Can't screen on partial critical data
Gate 2	EBITDA reconciliation variance > 5%	Flag — continue to screening	Submitted EBITDA checked against derived (GP − opex)
Gate 3	All others	Derive metrics, compile CSV	USD conversion, EBITDA margin, YoY growth, D/E ratio

Decision	Rationale
Minimum 3 years operating history (Gate 1)	EBITDA from fewer than 3 years of operations is not a reliable signal — it could reflect a launch spike rather than sustainable margin. Deferred, not declined.
Completeness gate before screening (Gate 1)	Scoring a company on 1 signal produces a misleading flag. Decline is a different operational action from Decline — it triggers a follow-up request, not a rejection.
EBITDA reconciliation check (Gate 4)	Submitted EBITDA is cross-checked against derived (gross profit − opex). A >5% variance flags the submission for reviewer check before the figure drives a decision. The company is not removed from the pipeline.
Prior year revenue as submitted field	The system calculates YoY growth from the two revenue figures rather than accepting a submitted growth rate. This separates raw data from derived metric and makes the calculation auditable.
Three-tier flag (Advance/Decline/Decline)	Collapse to binary pass/fail loses borderline companies that warrant a conditional conversation. Decline is a different operational action from Decline.
All 4 criteria required for Advance	A missing signal could be concealing a failure. Full visibility required for the highest confidence flag.
Signal failure chart sorted by frequency	The sort order carries meaning — the chart is designed to answer 'where is the pipeline breaking down most?'
Synthetic data (calibrated to WB indicators)	No publicly available dataset provides African SME-level financials at this granularity without confidentiality restrictions. WB microdata requires a signed confidentiality agreement. The indicator-level data used here does not.

Investment Readiness Screener — Phase 3: Screening & Analysis¶

Project context¶

Pipeline architecture¶

Source data and calibration¶

Gate logic (applied in Phase 2)¶

Screening thresholds¶

Section 0 — Setup and load¶

Section 1 — Apply screening thresholds¶

Section 2 — Chart 1: Pipeline conversion funnel¶

Section 3 — Chart 2: Signal failure breakdown¶

Section 4 — Chart 3: EBITDA margin vs revenue (coloured by flag)¶

Section 5 — Chart 4: Pipeline health dashboard¶

Section 6 — Sector and country breakdown¶

Section 7 — Full results export¶

Summary of all analytical decisions¶

World Bank Enterprise Survey calibration — full citation¶