# -*- coding: utf-8 -*-
"""5_nust_mathematicalmethodsofcomputing_deepseekR1confidenceinterval.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1CQXs65C9twkyFaVZ2SLW3pwL7QciPqN8
"""

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# DeepSeek R1: benchmark name -> reported score (%)
benchmarks = {
    "MATH-500": 97.3,
    "Codeforces": 96.3,
    "AIME 2024": 79.8,
    "SWE-bench Verified": 49.2,
    "GPQA Diamond": 71.5,
    "MMLU": 90.8
}

# Only the numeric scores are needed for the fit
scores = [*benchmarks.values()]

# Parameters of the fitted normal (np.std with its default ddof, i.e. population std)
mean, std = np.mean(scores), np.std(scores)

# Evaluate the fitted density on a 1000-point grid spanning mean +/- 4 std
x = np.linspace(mean - 4*std, mean + 4*std, 1000)
pdf = stats.norm.pdf(x, loc=mean, scale=std)

# Draw the curve, then place each benchmark on it at its own density value
plt.figure(figsize=(12, 6))
plt.plot(
    x,
    pdf,
    color='blue',
    label=f'Normal Distribution\nMean = {mean:.2f}, Std = {std:.2f}',
)
score_density = stats.norm.pdf(scores, loc=mean, scale=std)
plt.scatter(scores, score_density, color='red', zorder=5, label='Benchmark Scores')

# Vertical label offsets in points: 10 by default, with per-benchmark
# overrides so neighbouring labels do not overlap
offsets = {name: 10 for name in benchmarks}
offsets.update({
    "MATH-500": -20,  # pushed below its marker to avoid overlap
    "AIME 2024": 5,
})
for label, value in benchmarks.items():
    plt.annotate(
        label,
        xy=(value, stats.norm.pdf(value, loc=mean, scale=std)),
        xytext=(0, offsets[label]),
        textcoords="offset points",
        ha='center',
    )

# Axis decoration and display
plt.title('Normal Distribution of DeepSeek R1 Benchmark Scores')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()

# DeepSeek R-1 benchmark scores
scores = [97.3, 96.3, 79.8, 49.2, 71.5, 90.8]

# Step 1: sample size, mean, sample standard deviation (ddof=1) and
# the standard error of the mean
n = len(scores)
mean = np.mean(scores)
std_dev = np.std(scores, ddof=1)
standard_error = std_dev / np.sqrt(n)

# Step 2: two-tailed Student-t critical value for the 95% level
confidence_level = 0.95
alpha = 1 - confidence_level
df = n - 1
t_value = stats.t.ppf(1 - alpha/2, df=df)

# Steps 3-4: margin of error and the interval bounds around the mean
margin_of_error = t_value * standard_error
ci_lower, ci_upper = mean - margin_of_error, mean + margin_of_error

# Step 5: density curve over mean +/- 4 std_dev, plus the slice of the
# curve that lies between the interval bounds (used for shading)
x = np.linspace(mean - 4*std_dev, mean + 4*std_dev, 1000)
y = stats.norm.pdf(x, loc=mean, scale=std_dev)
x_fill = np.linspace(ci_lower, ci_upper, 1000)
y_fill = stats.norm.pdf(x_fill, loc=mean, scale=std_dev)

plt.figure(figsize=(12, 6))
plt.plot(x, y, color='skyblue', label='Normal Distribution')
plt.axvline(mean, linestyle='--', color='blue', label=f'Mean = {mean:.2f}')
plt.axvline(ci_lower, linestyle='--', color='green', label=f'95% CI Lower = {ci_lower:.2f}')
plt.axvline(ci_upper, linestyle='--', color='green', label=f'95% CI Upper = {ci_upper:.2f}')

# Shade the area under the curve between the interval bounds
plt.fill_between(x_fill, y_fill, alpha=0.5, color='lightgreen', label='95% Confidence Interval')

# Raw scores, drawn on the curve at their density values
score_density = stats.norm.pdf(scores, loc=mean, scale=std_dev)
plt.scatter(scores, score_density, color='red', zorder=5, label='Benchmark Scores')

plt.title('DeepSeek R-1 Benchmark Scores with 95% Confidence Interval')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Statistical summary, one line per quantity
for summary_line in (
    f"Mean: {mean:.2f}",
    f"Standard Error: {standard_error:.2f}",
    f"t-value (95% CI, df={df}): {t_value:.3f}",
    f"Margin of Error: {margin_of_error:.2f}",
    f"95% Confidence Interval: ({ci_lower:.2f}, {ci_upper:.2f})",
):
    print(summary_line)

# OpenAI o1-1217: benchmark name -> reported score (%)
benchmarks = {
    "MATH-500": 96.4,
    "Codeforces": 96.6,
    "AIME 2024": 79.2,
    "SWE-bench Verified": 48.9,
    "GPQA Diamond": 75.7,
    "MMLU": 91.8
}

# Only the numeric scores are needed for the fit
scores = [*benchmarks.values()]

# Parameters of the fitted normal (np.std with its default ddof, i.e. population std)
mean, std = np.mean(scores), np.std(scores)

# Evaluate the fitted density on a 1000-point grid spanning mean +/- 4 std
x = np.linspace(mean - 4*std, mean + 4*std, 1000)
pdf = stats.norm.pdf(x, loc=mean, scale=std)

# Draw the curve, then place each benchmark on it at its own density value
plt.figure(figsize=(12, 6))
plt.plot(
    x,
    pdf,
    color='blue',
    label=f'Normal Distribution\nMean = {mean:.2f}, Std = {std:.2f}',
)
score_density = stats.norm.pdf(scores, loc=mean, scale=std)
plt.scatter(scores, score_density, color='red', zorder=5, label='Benchmark Scores')

# Vertical label offsets in points: 10 by default, with the two labels
# that would otherwise collide pushed below their markers
offsets = {name: 10 for name in benchmarks}
offsets.update({
    "MATH-500": -20,
    "GPQA Diamond": -20,
})

for label, value in benchmarks.items():
    plt.annotate(
        label,
        xy=(value, stats.norm.pdf(value, loc=mean, scale=std)),
        xytext=(0, offsets[label]),
        textcoords="offset points",
        ha='center',
    )

# Axis decoration and display
plt.title('Normal Distribution of OpenAI o1-1217 Benchmark Scores')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()

# OpenAI o1-1217 benchmark scores
scores = [96.4, 96.6, 79.2, 48.9, 75.7, 91.8]

# Step 1: sample size, mean, sample standard deviation (ddof=1) and
# the standard error of the mean
n = len(scores)
mean = np.mean(scores)
std_dev = np.std(scores, ddof=1)
standard_error = std_dev / np.sqrt(n)

# Step 2: two-tailed Student-t critical value for the 95% level
confidence_level = 0.95
alpha = 1 - confidence_level
df = n - 1
t_value = stats.t.ppf(1 - alpha/2, df=df)

# Steps 3-4: margin of error and the interval bounds around the mean
margin_of_error = t_value * standard_error
ci_lower, ci_upper = mean - margin_of_error, mean + margin_of_error

# Step 5: density curve over mean +/- 4 std_dev, plus the slice of the
# curve that lies between the interval bounds (used for shading)
x = np.linspace(mean - 4*std_dev, mean + 4*std_dev, 1000)
y = stats.norm.pdf(x, loc=mean, scale=std_dev)
x_fill = np.linspace(ci_lower, ci_upper, 1000)
y_fill = stats.norm.pdf(x_fill, loc=mean, scale=std_dev)

plt.figure(figsize=(12, 6))
plt.plot(x, y, color='skyblue', label='Normal Distribution')
plt.axvline(mean, linestyle='--', color='blue', label=f'Mean = {mean:.2f}')
plt.axvline(ci_lower, linestyle='--', color='green', label=f'95% CI Lower = {ci_lower:.2f}')
plt.axvline(ci_upper, linestyle='--', color='green', label=f'95% CI Upper = {ci_upper:.2f}')

# Shade the area under the curve between the interval bounds
plt.fill_between(x_fill, y_fill, alpha=0.5, color='lightgreen', label='95% Confidence Interval')

# Raw scores, drawn on the curve at their density values
score_density = stats.norm.pdf(scores, loc=mean, scale=std_dev)
plt.scatter(scores, score_density, color='red', zorder=5, label='Benchmark Scores')

plt.title('OpenAI o1-1217 Benchmark Scores with 95% Confidence Interval')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Statistical summary, one line per quantity
for summary_line in (
    f"Mean: {mean:.2f}",
    f"Standard Error: {standard_error:.2f}",
    f"t-value (95% CI, df={df}): {t_value:.3f}",
    f"Margin of Error: {margin_of_error:.2f}",
    f"95% Confidence Interval: ({ci_lower:.2f}, {ci_upper:.2f})",
):
    print(summary_line)

# Benchmark scores for DeepSeek R1-32B (benchmark name -> score in %)
benchmarks = {
    "MATH-500": 94.3,
    "Codeforces": 90.6,
    "AIME 2024": 72.6,
    "SWE-bench Verified": 36.8,
    "GPQA Diamond": 62.1,
    "MMLU": 87.4
}

# Extract values
scores = list(benchmarks.values())

# Fit a normal distribution (np.std with its default ddof, i.e. population std)
mean = np.mean(scores)
std = np.std(scores)

# Generate x values spanning mean +/- 4 std of THIS model's fit.
# The grid must be built from `std`: at this point `std_dev` still holds the
# sample standard deviation computed for the previous model, which would
# give the curve an x-range unrelated to the distribution being drawn.
x = np.linspace(mean - 4*std, mean + 4*std, 1000)
pdf = stats.norm.pdf(x, mean, std)

# Plot the fitted curve and mark each benchmark at its density value
plt.figure(figsize=(12, 6))
plt.plot(x, pdf, label=f'Normal Distribution\nMean = {mean:.2f}, Std = {std:.2f}', color='blue')
plt.scatter(scores, stats.norm.pdf(scores, mean, std), color='red', zorder=5, label='Benchmark Scores')

# Vertical label offsets in points (positive = above the marker)
offsets = {
    "MATH-500": 10,
    "Codeforces": 10,
    "AIME 2024": 10,
    "SWE-bench Verified": 10,
    "GPQA Diamond": 10,
    "MMLU": 10
}
for label, value in benchmarks.items():
    plt.annotate(label, (value, stats.norm.pdf(value, mean, std)), textcoords="offset points", xytext=(0,offsets[label]), ha='center')

plt.title('Normal Distribution of DeepSeek R1-32B Benchmark Scores')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()

# DeepSeek R1-32B benchmark scores
scores = [94.3, 90.6, 72.6, 36.8, 62.1, 87.4]

# Step 1: sample size, mean, sample standard deviation (ddof=1) and
# the standard error of the mean
n = len(scores)
mean = np.mean(scores)
std_dev = np.std(scores, ddof=1)
standard_error = std_dev / np.sqrt(n)

# Step 2: two-tailed Student-t critical value for the 95% level
confidence_level = 0.95
alpha = 1 - confidence_level
df = n - 1
t_value = stats.t.ppf(1 - alpha/2, df=df)

# Steps 3-4: margin of error and the interval bounds around the mean
margin_of_error = t_value * standard_error
ci_lower, ci_upper = mean - margin_of_error, mean + margin_of_error

# Step 5: density curve over mean +/- 4 std_dev, plus the slice of the
# curve that lies between the interval bounds (used for shading)
x = np.linspace(mean - 4*std_dev, mean + 4*std_dev, 1000)
y = stats.norm.pdf(x, loc=mean, scale=std_dev)
x_fill = np.linspace(ci_lower, ci_upper, 1000)
y_fill = stats.norm.pdf(x_fill, loc=mean, scale=std_dev)

plt.figure(figsize=(12, 6))
plt.plot(x, y, color='skyblue', label='Normal Distribution')
plt.axvline(mean, linestyle='--', color='blue', label=f'Mean = {mean:.2f}')
plt.axvline(ci_lower, linestyle='--', color='green', label=f'95% CI Lower = {ci_lower:.2f}')
plt.axvline(ci_upper, linestyle='--', color='green', label=f'95% CI Upper = {ci_upper:.2f}')

# Shade the area under the curve between the interval bounds
plt.fill_between(x_fill, y_fill, alpha=0.5, color='lightgreen', label='95% Confidence Interval')

# Raw scores, drawn on the curve at their density values
score_density = stats.norm.pdf(scores, loc=mean, scale=std_dev)
plt.scatter(scores, score_density, color='red', zorder=5, label='Benchmark Scores')

plt.title('DeepSeek R1-32B Benchmark Scores with 95% Confidence Interval')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Statistical summary, one line per quantity
for summary_line in (
    f"Mean: {mean:.2f}",
    f"Standard Error: {standard_error:.2f}",
    f"t-value (95% CI, df={df}): {t_value:.3f}",
    f"Margin of Error: {margin_of_error:.2f}",
    f"95% Confidence Interval: ({ci_lower:.2f}, {ci_upper:.2f})",
):
    print(summary_line)

# Benchmark scores for OpenAI o1-mini (benchmark name -> score in %)
benchmarks = {
    "MATH-500": 90.0,
    "Codeforces": 93.4,
    "AIME 2024": 63.6,
    "SWE-bench Verified": 41.6,
    "GPQA Diamond": 62.1,
    "MMLU": 87.4
}

# Extract values
scores = list(benchmarks.values())

# Fit a normal distribution (np.std with its default ddof, i.e. population std)
mean = np.mean(scores)
std = np.std(scores)

# Generate x values spanning mean +/- 4 std of THIS model's fit.
# The grid must be built from `std`: at this point `std_dev` still holds the
# sample standard deviation computed for the previous model, which would
# give the curve an x-range unrelated to the distribution being drawn.
x = np.linspace(mean - 4*std, mean + 4*std, 1000)
pdf = stats.norm.pdf(x, mean, std)

# Plot the fitted curve and mark each benchmark at its density value
plt.figure(figsize=(12, 6))
plt.plot(x, pdf, label=f'Normal Distribution\nMean = {mean:.2f}, Std = {std:.2f}', color='blue')
plt.scatter(scores, stats.norm.pdf(scores, mean, std), color='red', zorder=5, label='Benchmark Scores')

# Vertical label offsets in points (positive = above the marker)
offsets = {
    "MATH-500": 10,
    "Codeforces": 10,
    "AIME 2024": 10,
    "SWE-bench Verified": 10,
    "GPQA Diamond": -20,  # Shift this one down to avoid overlap
    "MMLU": 10
}
for label, value in benchmarks.items():
    plt.annotate(label, (value, stats.norm.pdf(value, mean, std)), textcoords="offset points", xytext=(0,offsets[label]), ha='center')

plt.title('Normal Distribution of OpenAI o1-mini Benchmark Scores')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()

# OpenAI o1-mini benchmark scores
scores = [90.0, 93.4, 63.6, 41.6, 62.1, 87.4]

# Step 1: sample size, mean, sample standard deviation (ddof=1) and
# the standard error of the mean
n = len(scores)
mean = np.mean(scores)
std_dev = np.std(scores, ddof=1)
standard_error = std_dev / np.sqrt(n)

# Step 2: two-tailed Student-t critical value for the 95% level
confidence_level = 0.95
alpha = 1 - confidence_level
df = n - 1
t_value = stats.t.ppf(1 - alpha/2, df=df)

# Steps 3-4: margin of error and the interval bounds around the mean
margin_of_error = t_value * standard_error
ci_lower, ci_upper = mean - margin_of_error, mean + margin_of_error

# Step 5: density curve over mean +/- 4 std_dev, plus the slice of the
# curve that lies between the interval bounds (used for shading)
x = np.linspace(mean - 4*std_dev, mean + 4*std_dev, 1000)
y = stats.norm.pdf(x, loc=mean, scale=std_dev)
x_fill = np.linspace(ci_lower, ci_upper, 1000)
y_fill = stats.norm.pdf(x_fill, loc=mean, scale=std_dev)

plt.figure(figsize=(12, 6))
plt.plot(x, y, color='skyblue', label='Normal Distribution')
plt.axvline(mean, linestyle='--', color='blue', label=f'Mean = {mean:.2f}')
plt.axvline(ci_lower, linestyle='--', color='green', label=f'95% CI Lower = {ci_lower:.2f}')
plt.axvline(ci_upper, linestyle='--', color='green', label=f'95% CI Upper = {ci_upper:.2f}')

# Shade the area under the curve between the interval bounds
plt.fill_between(x_fill, y_fill, alpha=0.5, color='lightgreen', label='95% Confidence Interval')

# Raw scores, drawn on the curve at their density values
score_density = stats.norm.pdf(scores, loc=mean, scale=std_dev)
plt.scatter(scores, score_density, color='red', zorder=5, label='Benchmark Scores')

plt.title('OpenAI o1-mini Benchmark Scores with 95% Confidence Interval')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Statistical summary, one line per quantity
for summary_line in (
    f"Mean: {mean:.2f}",
    f"Standard Error: {standard_error:.2f}",
    f"t-value (95% CI, df={df}): {t_value:.3f}",
    f"Margin of Error: {margin_of_error:.2f}",
    f"95% Confidence Interval: ({ci_lower:.2f}, {ci_upper:.2f})",
):
    print(summary_line)

# Benchmark scores for DeepSeek V3 (benchmark name -> score in %)
benchmarks = {
    "MATH-500": 90.2,
    "Codeforces": 58.7,
    "AIME 2024": 39.2,
    "SWE-bench Verified": 42.0,
    "GPQA Diamond": 59.1,
    "MMLU": 88.5
}

# Extract values
scores = list(benchmarks.values())

# Fit a normal distribution (np.std with its default ddof, i.e. population std)
mean = np.mean(scores)
std = np.std(scores)

# Generate x values spanning mean +/- 4 std of THIS model's fit.
# The grid must be built from `std`: at this point `std_dev` still holds the
# sample standard deviation computed for the previous model, which would
# give the curve an x-range unrelated to the distribution being drawn.
x = np.linspace(mean - 4*std, mean + 4*std, 1000)
pdf = stats.norm.pdf(x, mean, std)

# Plot the fitted curve and mark each benchmark at its density value
plt.figure(figsize=(12, 6))
plt.plot(x, pdf, label=f'Normal Distribution\nMean = {mean:.2f}, Std = {std:.2f}', color='blue')
plt.scatter(scores, stats.norm.pdf(scores, mean, std), color='red', zorder=5, label='Benchmark Scores')

# Vertical label offsets in points: positive places the label above its
# marker, negative places it below
offsets = {
    "MATH-500": -20,  # Below the marker
    "Codeforces": -20,  # Below the marker
    "AIME 2024": 10,
    "SWE-bench Verified": 10,
    "GPQA Diamond": 10,
    "MMLU": 10
}
for label, value in benchmarks.items():
    plt.annotate(label, (value, stats.norm.pdf(value, mean, std)), textcoords="offset points", xytext=(0, offsets[label]), ha='center')

plt.title('Normal Distribution of DeepSeek V3 Benchmark Scores')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()

# DeepSeek V3 benchmark scores
scores = [90.2, 58.7, 39.2, 42.0, 59.1, 88.5]

# Step 1: sample size, mean, sample standard deviation (ddof=1) and
# the standard error of the mean
n = len(scores)
mean = np.mean(scores)
std_dev = np.std(scores, ddof=1)
standard_error = std_dev / np.sqrt(n)

# Step 2: two-tailed Student-t critical value for the 95% level
confidence_level = 0.95
alpha = 1 - confidence_level
df = n - 1
t_value = stats.t.ppf(1 - alpha/2, df=df)

# Steps 3-4: margin of error and the interval bounds around the mean
margin_of_error = t_value * standard_error
ci_lower, ci_upper = mean - margin_of_error, mean + margin_of_error

# Step 5: density curve over mean +/- 4 std_dev, plus the slice of the
# curve that lies between the interval bounds (used for shading)
x = np.linspace(mean - 4*std_dev, mean + 4*std_dev, 1000)
y = stats.norm.pdf(x, loc=mean, scale=std_dev)
x_fill = np.linspace(ci_lower, ci_upper, 1000)
y_fill = stats.norm.pdf(x_fill, loc=mean, scale=std_dev)

plt.figure(figsize=(12, 6))
plt.plot(x, y, color='skyblue', label='Normal Distribution')
plt.axvline(mean, linestyle='--', color='blue', label=f'Mean = {mean:.2f}')
plt.axvline(ci_lower, linestyle='--', color='green', label=f'95% CI Lower = {ci_lower:.2f}')
plt.axvline(ci_upper, linestyle='--', color='green', label=f'95% CI Upper = {ci_upper:.2f}')

# Shade the area under the curve between the interval bounds
plt.fill_between(x_fill, y_fill, alpha=0.5, color='lightgreen', label='95% Confidence Interval')

# Raw scores, drawn on the curve at their density values
score_density = stats.norm.pdf(scores, loc=mean, scale=std_dev)
plt.scatter(scores, score_density, color='red', zorder=5, label='Benchmark Scores')

plt.title('DeepSeek V3 Benchmark Scores with 95% Confidence Interval')
plt.xlabel('Score (%)')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Statistical summary, one line per quantity
for summary_line in (
    f"Mean: {mean:.2f}",
    f"Standard Error: {standard_error:.2f}",
    f"t-value (95% CI, df={df}): {t_value:.3f}",
    f"Margin of Error: {margin_of_error:.2f}",
    f"95% Confidence Interval: ({ci_lower:.2f}, {ci_upper:.2f})",
):
    print(summary_line)