Use Fuzzing Book Resources¶
Clone the Fuzzing Book repository and place this notebook in the fuzzingbook/notebooks/ directory. Then you can use the Fuzzing Book resources in this notebook.
import bookutils
from typing import List, Tuple, Dict, Any, Set
from Fuzzer import RandomFuzzer
from html.parser import HTMLParser
from Coverage import Coverage
import pickle
import hashlib
# Create simple fuzzer
fuzzer = RandomFuzzer(
min_length=1, max_length=100, char_start=32, char_range=94
)
# Create simple program-under-test
def my_parser(inp: str) -> None:
parser = HTMLParser()
parser.feed(inp)
def getTraceHash(cov: Coverage) -> str:
    # Serialize the coverage (the set of executed locations) and hash it,
    # so that each distinct trace gets a short, comparable identifier
    pickledCov = pickle.dumps(cov.coverage())
    hashedCov = hashlib.md5(pickledCov).hexdigest()
    return hashedCov
# Every program path is a color
inp = "<html>"
with Coverage() as cov:
my_parser(inp)
getTraceHash(cov)
'f9e386d0a19f8b21a36a63a167c434dc'
Different coverage traces have different rarity¶
Use the fuzzer to create a population¶
trials = 50000
population = []
for i in range(trials):
population.append(fuzzer.fuzz())
population[:3]
['"N&+slk%hyp5o\'@[3(rW*M5W]tMFPU4\\P@tz%[X?uo\\1?b4T;1bDeYtHx #UJ5w}pMmPodJM,_%%', 'CdYN6*g|Y*Ou9I<P94}7,99ivb(9`=%jJj*', "*dOLXk!;Jw!iOU8]hqg00?u(c);>:\\=V<ZV1=*g#UJA'No5QZ)--[})Sdv"]
Record the frequency of each coverage¶
# TODO: Implement the code to calculate the coverage of the population
############# IMPLEMENT HERE #############
all_coverage_dict = {}
for inp in population:
with Coverage() as cov:
try:
my_parser(inp)
except BaseException:
pass
cov_hash = getTraceHash(cov)
if cov_hash not in all_coverage_dict:
all_coverage_dict[cov_hash] = 0
all_coverage_dict[cov_hash] += 1
##########################################
Distribution of the coverage¶
import matplotlib.pyplot as plt
import seaborn as sns
num_unique = len(all_coverage_dict)
print(f"Number of unique traces: {num_unique}")
print(
f"Top 3 frequencies: {sorted(list(all_coverage_dict.values()), reverse=True)[:3]}"
)
print(f"Bottom 3 frequencies: {sorted(list(all_coverage_dict.values()))[:3]}")
freqs = sorted(list(all_coverage_dict.values()), reverse=True)
# print bar chart of the frequencies
fig, ax = plt.subplots()
sns.barplot(x=list(range(len(freqs))), y=freqs, ax=ax)
# remove the x labels
ax.set_xticks([])
# set log-scale for y-axis
ax.set_yscale("log")
plt.show()
Number of unique traces: 511
Top 3 frequencies: [20363, 5504, 5342]
Bottom 3 frequencies: [1, 1, 1]
Some Definitions¶
- Singletons: colors that appear only once in the sample
- Doubletons: colors that appear twice in the sample
$$ \Phi_k = \text{the number of colors that appear $k$ times in the sample} $$
- $\Phi_1$ = the number of singletons, $\Phi_2$ = the number of doubletons, etc.
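As a tiny worked example (a standalone sketch, not part of the exercise pipeline; the names sample, freq, and phi are just for illustration): in the sample [A, B, A, C], colors B and C are singletons and A is a doubleton, so $\Phi_1 = 2$ and $\Phi_2 = 1$. With collections.Counter:
from collections import Counter
sample = ["A", "B", "A", "C"]  # hypothetical sample of colors
freq = Counter(sample)  # color -> number of occurrences
phi = Counter(freq.values())  # phi[k] = number of colors seen exactly k times
print(phi[1], phi[2])  # 2 singletons (B, C), 1 doubleton (A)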
# TODO: Compute the singletons and doubletons
############# IMPLEMENT HERE #############
singletons = {k: v for k, v in all_coverage_dict.items() if v == 1}
doubletons = {k: v for k, v in all_coverage_dict.items() if v == 2}
##########################################
print(
"The number of coverage elements seen exactly once is "
+ str(len(singletons))
)
print(
"The number of coverage elements seen exactly twice is "
+ str(len(doubletons))
)
The number of coverage elements seen exactly once is 216
The number of coverage elements seen exactly twice is 77
💡 Go back to the slides
Missing Mass: What is the probability that the next generated input increases coverage?¶
Estimating the probability of generating a coverage-increasing input¶
- Good-Turing estimator: $$ \hat{M}_0 = \frac{\Phi_1}{n} $$
# TODO: Implement the Good-Turing estimator
############# IMPLEMENT HERE #############
estimate = len(singletons) / trials
##########################################
print(
"The probability that the next input increases coverage is estimated as "
+ str(estimate)
)
The probability that the next input increases coverage is estimated as 0.00432
How do we evaluate this?¶
# TODO: Evaluate the estimator empirically
############# IMPLEMENT HERE #############
curr_coverage = all_coverage_dict.keys()
count = 0
validation = []
for i in range(trials): # sample the same number of inputs as in the population
validation.append(fuzzer.fuzz())
for inp in validation:
with Coverage() as cov:
try:
my_parser(inp)
except BaseException:
pass
this_coverage = set([getTraceHash(cov)])
if len(curr_coverage & this_coverage) == 0:
count = count + 1
empirical = count / trials
##########################################
print(
"The probability that the next input increases coverage is empirically "
+ str(empirical)
)
The probability that the next input increases coverage is empirically 0.00414
Why does Good-Turing work so well?¶
Intuitive explanation: "if you observe a new thing at this round, it becomes one of the singletons at the next round."
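A quick sanity check (a self-contained sketch, not part of the exercise; the distribution is made up for illustration): draw from a distribution whose probabilities we know, so the true missing mass can be computed exactly, and compare it with the Good-Turing estimate.
import random
from collections import Counter
random.seed(0)
# hypothetical "color" distribution: color i has weight 2^-(i+1)
weights = [2 ** -(i + 1) for i in range(20)]
total = sum(weights)
counts = Counter(random.choices(range(20), weights=weights, k=1000))
# true missing mass: total probability of the colors never drawn
true_missing = sum(w for i, w in enumerate(weights) if i not in counts) / total
# Good-Turing estimate: number of singletons over the sample size
good_turing = sum(1 for c in counts.values() if c == 1) / 1000
print(f"true missing mass: {true_missing:.4f}, Good-Turing: {good_turing:.4f}")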
💡 Go back to the slides for a slightly deeper explanation :)
Species Richness: What is the maximum coverage we can achieve?¶
The estimated lower bound on the remaining (not-yet-discovered) coverage is $$ \hat\Phi_0 = \frac{n - 1}{n}\frac{\Phi_1^2}{2\Phi_2} $$ (the Chao1-style lower bound on the number of unseen species; the same $\hat\Phi_0$ reappears in the extrapolation formula below).
# TODO: Estimate the Species Richness
############# IMPLEMENT HERE #############
estimate = int(
(trials - 1) / trials * (len(singletons) ** 2) / (2 * len(doubletons))
)
##########################################
print(
f"The lower bound of the number of coverages that are still unexplored is {estimate}"
)
The lower bound of the number of coverages that are still unexplored is 302
How do we evaluate this?¶
Let's try to run the fuzzer much longer and see if we can reach the estimate.
# TODO: Evaluate the estimator by empirically observing
# the number of new coverages for a large number of inputs
############# IMPLEMENT HERE #############
extra_trials = trials * 2
validation = []
for i in range(extra_trials):
validation.append(fuzzer.fuzz())
extra_coverage = set()
for idx, inp in enumerate(validation, 1):
if idx % trials == 0:
print(f"Processed {idx} inputs / {extra_trials}", end="\r", flush=True)
with Coverage() as cov:
try:
my_parser(inp)
except BaseException:
pass
cov_hash = set([getTraceHash(cov)])
extra_coverage |= cov_hash
print()
extra_coverage -= curr_coverage
##########################################
print(f"Number of new coverages: {len(extra_coverage)}")
Processed 100000 inputs / 100000
Number of new coverages: 275
💡 Go back to the slides
Extrapolation: How much can I discover more when I spend $X$ more time here?¶
- $\Delta(m)$: the number of new discoveries when $m$ more samples are retrieved.
$$ \hat \Delta(m) = \hat \Phi_0 \left[1 - \left(1 - \frac{\Phi_1}{n\hat \Phi_0 + \Phi_1} \right)^m\right] $$
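Since $0 < 1 - \frac{\Phi_1}{n\hat\Phi_0 + \Phi_1} < 1$, we have $\hat\Delta(m) \to \hat\Phi_0$ as $m \to \infty$: the extrapolated discovery curve flattens out at the species-richness estimate. A quick standalone check, plugging in the estimates from the 50,000-input run above:
n, phi_1, phi_0 = 50000, 216, 302  # values from the 50,000-input run above
delta = lambda m: phi_0 * (1 - (1 - phi_1 / (n * phi_0 + phi_1)) ** m)
print(delta(50000), delta(10 ** 7))  # the latter is already close to phi_0 = 302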
Show the coverage increase with the number of samples.¶
- Compute the cumulative coverage of a population of 200,000 inputs plus an additional 200,000 inputs.
- Extrapolate the number of unique coverages from the point where we have 200,000 samples.
############################################################
# First 200,000 inputs
############################################################
trials = 200000
population = []
for i in range(trials):
population.append(fuzzer.fuzz())
cumulative_coverage = []
all_coverage = set()
singletons = set()  # Track the current singleton coverages
doubletons = set()  # and doubleton coverages for the extrapolation
for inp in population:
with Coverage() as cov:
try:
my_parser(inp)
except BaseException:
pass
    this_coverage = set([getTraceHash(cov)])
    # Incrementally maintain the singleton and doubleton sets:
    doubletons -= this_coverage  # a third occurrence is no longer a doubleton
    doubletons |= singletons & this_coverage  # a second occurrence: singleton -> doubleton
    singletons -= this_coverage  # ... and it stops being a singleton
    singletons |= this_coverage - all_coverage  # a first occurrence is a new singleton
    all_coverage |= this_coverage
    cumulative_coverage.append(len(all_coverage))
num_singletons = len(singletons)
num_doubletons = len(doubletons)
############################################################
# Additional 200,000 inputs
############################################################
extra_trials = 200000
validation = []
for i in range(extra_trials):
validation.append(fuzzer.fuzz())
for inp in validation:
with Coverage() as cov:
try:
my_parser(inp)
except BaseException:
pass
this_coverage = set([getTraceHash(cov)])
all_coverage |= this_coverage
cumulative_coverage.append(len(all_coverage))
fig, ax = plt.subplots()
coverage_ateach_1000trial = [0] + cumulative_coverage[
999::1000
] # for every 1000 trials
sub_coverage_until_half = coverage_ateach_1000trial[:201]
sns.lineplot(
x=range(0, 200001, 1000),
y=sub_coverage_until_half,
ax=ax,
)
ax.set_xlim(0, 400000)
ax.set_ylim(0, cumulative_coverage[-1])
ax.set_xticks([0, 100000, 200000, 300000, 400000])
ax.set_xlabel("Number of inputs")
ax.set_ylabel("Number of unique coverages")
plt.show()
fig, ax = plt.subplots()
sns.lineplot(
x=range(0, 400001, 1000),
y=coverage_ateach_1000trial,
ax=ax,
)
ax.set_xlabel("Number of inputs")
ax.set_ylabel("Number of unique coverages")
plt.show()
## formula:
## - \Phi_0 = (n - 1) / n * (\Phi_1^2 / (2 \Phi_2))
## - \Delta(m) = \Phi_0 [1 - (1 - \Phi_1/(n \Phi_0 + \Phi_1) )^m ]
############# IMPLEMENT HERE #############
phi1 = num_singletons
phi2 = num_doubletons
phi0 = (trials - 1) / trials * (phi1**2 / (2 * phi2))
extrapolator = lambda m: phi0 * (1 - (1 - phi1 / (trials * phi0 + phi1)) ** m)
extrapolated = [extrapolator(i) for i in range(1, 200001)]
extrapolated = [delta + sub_coverage_until_half[-1] for delta in extrapolated]
##########################################
fig, ax = plt.subplots()
sns.lineplot(
x=range(0, 200001, 1000),
y=sub_coverage_until_half,
ax=ax,
color="black",
)
sns.lineplot(
x=range(201000, 400001, 1000),
y=extrapolated[999::1000],
ax=ax,
color="blue",
    linestyle="--",
)
plt.show()
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# 1.
sns.lineplot(
x=range(0, 200001, 1000),
y=sub_coverage_until_half,
ax=axes[0],
)
axes[0].set_xlim(0, 400000)
axes[0].set_ylim(0, cumulative_coverage[-1])
axes[0].set_xticks([0, 100000, 200000, 300000, 400000])
axes[0].set_xlabel("Number of inputs")
axes[0].set_ylabel("Number of unique coverages")
# 2.
sns.lineplot(
x=range(0, 400001, 1000),
y=coverage_ateach_1000trial,
ax=axes[1],
)
axes[1].set_xlabel("Number of inputs")
axes[1].set_ylabel("Number of unique coverages")
# 3.
sns.lineplot(
x=range(0, 200001, 1000),
y=sub_coverage_until_half,
ax=axes[2],
color="black",
)
sns.lineplot(
x=range(201000, 400001, 1000),
y=extrapolated[999::1000],
ax=axes[2],
color="blue",
    linestyle="--",
)
sns.lineplot(
x=range(201000, 400001, 1000),
y=coverage_ateach_1000trial[201:],
ax=axes[2],
color="red",
)
plt.show()
💡 Go back to the slides
Reaching Probability: What is the probability of reaching a specific program element?¶
Let's consider an example of the control flow graph of a program.¶
Let's start with a small one with five statements.
from bn import SimpleBN
import numpy as np
np.random.seed(0)
sbn = SimpleBN(5)
display(sbn.draw())
min_prob = min(sbn.reach_prob_dict.values())
min_nidx = min(sbn.reach_prob_dict, key=sbn.reach_prob_dict.get)
print(f"Min prob: {min_prob} at node {min_nidx}")
Min prob: 0.00614593848538574 at node 3
Of course, there are some issues with using a Bayesian network for CFGs:
- Modeling the CFG as a Bayesian network assumes that the branch conditions are independent,
- which is not true in general (e.g., two branches that test the same variable are correlated).
But it's a fair approximation for the exercises :)
Edge list:
display(sbn.edgeidx2edge)
{0: (0, 1), 1: (0, 4), 2: (1, 2), 3: (1, 4), 4: (2, 3), 5: (2, 4), 6: (3, 4)}
Generate an execution
sbn.gen_obss(1)[0]
array([0, 1, 0, 0, 0, 0, 0], dtype=int8)
edge_cov = sbn.gen_obss(20)
display(edge_cov)
node_cov = sbn.edgecov2nodecov(edge_cov)
display(node_cov)
covered_nodes = np.logical_or.reduce(node_cov, 0).astype(np.int8)
covered_node_idxs = np.nonzero(covered_nodes)[0]
display(covered_node_idxs)
array([[0, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0]], dtype=int8)
array([[1, 0, 0, 0, 1], [1, 1, 1, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 1, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 1, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]], dtype=int8)
array([0, 1, 2, 4])
Let's consider a slightly bigger program.
sbn = SimpleBN(30)
display(sbn.draw())
min_prob = min(sbn.reach_prob_dict.values())
min_nidx = min(sbn.reach_prob_dict, key=sbn.reach_prob_dict.get)
print(f"Min prob: {min_prob} at node {min_nidx}")
Min prob: 1.8651992743955428e-07 at node 21
num_exec = 1000
edge_cov = sbn.gen_obss(num_exec)
node_cov = sbn.edgecov2nodecov(edge_cov)
coverage_freq = np.sum(node_cov, 0)
coverage_freq_dict = {i: coverage_freq[i] for i in range(len(coverage_freq))}
print("Coverage frequencies:")
display(coverage_freq_dict)
covered_nodes = coverage_freq > 0
covered_node_idxs = set(np.nonzero(covered_nodes)[0])
uncovered_node_idxs = set(np.nonzero(1 - covered_nodes)[0])
print("Uncovered nodes:", sorted(uncovered_node_idxs))
for node_idx in sorted(uncovered_node_idxs):
print(f"Pr({node_idx}) = {sbn.reach_prob_dict[node_idx]}")
Coverage frequencies:
{0: 1000, 1: 651, 2: 603, 3: 349, 4: 48, 5: 300, 6: 45, 7: 3, 8: 124, 9: 5, 10: 1, 11: 49, 12: 0, 13: 8, 14: 37, 15: 0, 16: 3, 17: 8, 18: 7, 19: 46, 20: 0, 21: 0, 22: 490, 23: 0, 24: 46, 25: 38, 26: 121, 27: 492, 28: 309, 29: 40}
Uncovered nodes: [12, 15, 20, 21, 23]
Pr(12) = 4.8272237587007876e-05
Pr(15) = 4.660665704523967e-06
Pr(20) = 6.127973733909666e-07
Pr(21) = 1.8651992743955428e-07
Pr(23) = 4.262774459514123e-07
Blackbox estimation¶
# Laplace estimator: (0 + alpha) / (n + 2 * alpha) for an event
# never observed in n trials, with pseudocount alpha per outcome
alpha = 2
esti_laplace = alpha / (num_exec + 2 * alpha)
# Good-Turing estimator
edge_cov_str = ["".join(map(str, row)) for row in edge_cov]
edge_cov_freq = {edge: edge_cov_str.count(edge) for edge in set(edge_cov_str)}
singletons = {k: v for k, v in edge_cov_freq.items() if v == 1}
est_goodturing = len(singletons) / num_exec
print(f"Estimated Laplace: {esti_laplace}")
print(f"Estimated Good-Turing: {est_goodturing}")
Estimated Laplace: 0.00199203187250996 Estimated Good-Turing: 0.004
Structure-aware estimation¶
For each uncovered statement,
- find the set of closest reached statements;
- for each of them,
  - find the paths from that reached statement to the uncovered statement that pass only through uncovered statements,
  - compute the probability along each path;
- sum up the probabilities.
The cells below walk through these steps for the first uncovered node.
uncovered_node = sorted(uncovered_node_idxs)[0]
print(f"Node {uncovered_node} is uncovered.")
ancestor_nodes = sbn.get_ancestors(uncovered_node)
print(f"Ancestor nodes: {ancestor_nodes}")
############# IMPLEMENT HERE #############
reached_ancestor_nodes = ancestor_nodes & covered_node_idxs
print(f"Reached ancestor nodes: {reached_ancestor_nodes}")
##########################################
Node 12 is uncovered.
Ancestor nodes: {0, 1, 4, 7, 10, 12}
Reached ancestor nodes: {0, 1, 4, 7, 10}
def find_closest_reached_ancestor(
    target_node: int, reached_ancestor_nodes: Set[int], sbn: SimpleBN
) -> Set[int]:
if target_node in reached_ancestor_nodes:
return {target_node}
############# IMPLEMENT HERE #############
    # BFS upward through unreached ancestors until reached ones are found
    closest = set()
visited = set()
queue = [target_node]
while len(queue):
node = queue.pop(0)
if node in visited:
continue
visited.add(node)
if node in reached_ancestor_nodes:
closest.add(node)
else:
queue.extend(sbn.get_parents(node))
##########################################
return closest
closest_reached_ancestors = find_closest_reached_ancestor(
uncovered_node, reached_ancestor_nodes, sbn
)
print(f"Closest reached ancestor nodes: {closest_reached_ancestors}")
Closest reached ancestor nodes: {10}
def find_paths(
    source_node: int, target_node: int, avoid_nodes: Set[int], sbn: SimpleBN
) -> List[List[int]]:
############# IMPLEMENT HERE #############
paths = []
queue = [[source_node]]
while len(queue):
path = queue.pop(0)
node = path[-1]
if node == target_node:
paths.append(path)
else:
for child in sbn.get_children(node):
if child in avoid_nodes:
continue
queue.append(path + [child])
##########################################
return paths
for cra in sorted(closest_reached_ancestors):
paths = find_paths(cra, uncovered_node, covered_node_idxs, sbn)
print(f"Paths from {cra} to {uncovered_node}: {paths}")
Paths from 10 to 12: [[10, 12]]
def compute_prob(path: list, sbn: SimpleBN, n: int, start_freq: int) -> float:
    ############# IMPLEMENT HERE #############
    # empirical probability of reaching the start node
    prob = start_freq / n
    # Laplace-smoothed probability of the never-taken first edge:
    # (0 + alpha) / (start_freq + 2 * alpha) with alpha = 2
    prob *= 2 / (start_freq + 4)
    if len(path) > 2:
        for node in path[1:-1]:
            # assume a uniform choice among the children of each
            # intermediate (uncovered) node
            prob *= 1 / len(sbn.get_children(node))
##########################################
return prob
total_prob = 0
for cra in sorted(closest_reached_ancestors):
paths = find_paths(cra, uncovered_node, covered_node_idxs, sbn)
for path in paths:
prob = compute_prob(path, sbn, num_exec, coverage_freq_dict[cra])
print(f"Probability of path {path}: {prob}")
total_prob += prob
print(f"Total probability of reaching node {uncovered_node}: {total_prob}")
Probability of path [10, 12]: 0.0004 Total probability of reaching node 12: 0.0004
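Hand check: node 10 was reached once in 1,000 executions (coverage_freq_dict[10] = 1 in the frequency table above), so the path probability is $\frac{1}{1000} \cdot \frac{2}{1 + 4} = 0.0004$, matching the output.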
# Another one: pick the next uncovered node
uncovered_node = sorted(uncovered_node_idxs)[1]  # node 15 in this run
print(f"Node {uncovered_node} is uncovered.")
ancestor_nodes = sbn.get_ancestors(uncovered_node)
print(f"Ancestor nodes: {ancestor_nodes}")
reached_ancestor_nodes = ancestor_nodes & covered_node_idxs
print(f"Reached ancestor nodes: {reached_ancestor_nodes}")
closest_reached_ancestors = find_closest_reached_ancestor(
uncovered_node, reached_ancestor_nodes, sbn
)
print(f"Closest reached ancestor nodes: {closest_reached_ancestors}")
total_prob = 0
for cra in sorted(closest_reached_ancestors):
paths = find_paths(cra, uncovered_node, covered_node_idxs, sbn)
print(f"Paths from {cra} to {uncovered_node}: {paths}")
for path in paths:
prob = compute_prob(path, sbn, num_exec, coverage_freq_dict[cra])
print(f"Probability of path {path}: {prob}")
total_prob += prob
print(f"Total probability of reaching node {uncovered_node}: {total_prob}")
def structure_aware_esti(
    uncovered_node: int, sbn: SimpleBN, coverage_freq_dict: Dict[int, int]
) -> float:
ancestor_nodes = sbn.get_ancestors(uncovered_node)
covered_node_idxs = {k for k, v in coverage_freq_dict.items() if v > 0}
reached_ancestor_nodes = ancestor_nodes & covered_node_idxs
closest_reached_ancestors = find_closest_reached_ancestor(
uncovered_node, reached_ancestor_nodes, sbn
)
    # the entry node 0 is covered by every run, so its frequency equals the
    # number of executions (summing all node frequencies would overcount n)
    num_exec = coverage_freq_dict[0]
total_prob = 0
    for cra in sorted(closest_reached_ancestors):
        # avoid the other closest reached ancestors so that paths passing
        # through them are not counted twice
        remaining_cras = closest_reached_ancestors - {cra}
        paths = find_paths(cra, uncovered_node, remaining_cras, sbn)
for path in paths:
            prob = compute_prob(path, sbn, num_exec, coverage_freq_dict[cra])
total_prob += prob
return total_prob
import pandas as pd
df = pd.DataFrame(
{
"node": sorted(uncovered_node_idxs),
"true": [sbn.reach_prob_dict[n] for n in sorted(uncovered_node_idxs)],
"lap": [esti_laplace] * len(uncovered_node_idxs),
"good": [est_goodturing] * len(uncovered_node_idxs),
"struct": [
structure_aware_esti(n, sbn, coverage_freq_dict)
for n in sorted(uncovered_node_idxs)
],
}
)
display(df)
# draw barplot
df_melt = df.melt(id_vars="node", var_name="method", value_name="prob")
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x="node", y="prob", hue="method", data=df_melt, ax=ax)
plt.show()
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x="node", y="prob", hue="method", data=df_melt, ax=ax)
ax.set_yscale("log")
plt.show()
|   | node | true | lap | good | struct |
|---|---|---|---|---|---|
| 0 | 12 | 4.827224e-05 | 0.001992 | 0.004 | 0.000083 |
| 1 | 15 | 4.660666e-06 | 0.001992 | 0.004 | 0.000041 |
| 2 | 20 | 6.127974e-07 | 0.001992 | 0.004 | 0.000021 |
| 3 | 21 | 1.865199e-07 | 0.001992 | 0.004 | 0.000010 |
| 4 | 23 | 4.262774e-07 | 0.001992 | 0.004 | 0.000010 |
💡 Go back to the slides