From f3f3dddfcbe9ff66e3391516ba9880713a25f29f Mon Sep 17 00:00:00 2001
From: Yan Wong <yan.wong@bdi.ox.ac.uk>
Date: Sat, 25 Apr 2026 11:20:10 +0100
Subject: [PATCH 1/3] Change the variable to `arg` in the ARG tute

And add arg_num_children example.
---
 args.md | 76 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 27 deletions(-)

diff --git a/args.md b/args.md
index ebcb6c7a..8e13293d 100644
--- a/args.md
+++ b/args.md
@@ -49,7 +49,8 @@ coalescence in any of the local trees. Full ARGs can be stored and analysed in
 {func}`msprime:msprime.sim_ancestry` by specifying `coalescing_segments_only=False` along with
 `additional_nodes = msprime.NodeType.COMMON_ANCESTOR | msprime.NodeType.RECOMBINANT`
 (or the equivalent `record_full_arg=True`) as described
-{ref}`in the msprime docs<msprime:sec_ancestry_full_arg>`:
+{ref}`in the msprime docs<msprime:sec_ancestry_full_arg>`. To remind ourselves that this
+is a full ARG, we will use the variable `arg` rather than `ts`
 
 ```{code-cell}
 import msprime
@@ -62,7 +63,7 @@ parameters = {
     "random_seed": 333,
 }
 
-ts_arg = msprime.sim_ancestry(
+arg = msprime.sim_ancestry(
     **parameters,
     discrete_genome=False,  # the strict Hudson ARG needs unique crossover positions (i.e. a continuous genome)
     coalescing_segments_only=False,   # setting record_full_arg=True is equivalent to these last 2 parameters
@@ -71,8 +72,8 @@ ts_arg = msprime.sim_ancestry(
 
 print('Simulated a "full ARG" under the Hudson model:')
 print(
-    f" ARG stored in a tree sequence with {ts_arg.num_nodes} nodes and"
-    f" {ts_arg.num_edges} edges (creating {ts_arg.num_trees} local trees)"
+    f" ARG stored in a tree sequence with {arg.num_nodes} nodes and"
+    f" {arg.num_edges} edges (creating {arg.num_trees} local trees)"
 )
 ```
 
@@ -83,9 +84,9 @@ variation:
 ```{code-cell}
 import numpy as np
 mu = 1e-7
-ts_arg = msprime.sim_mutations(ts_arg, rate=mu, random_seed=888)
-print("     Sample node:  " + "   ".join(str(u) for u in ts_arg.samples()))
-for v in ts_arg.variants():
+arg = msprime.sim_mutations(arg, rate=mu, random_seed=888)
+print("     Sample node:  " + "   ".join(str(u) for u in arg.samples()))
+for v in arg.variants():
     print(f"Variable site {v.site.id}:", np.array(v.alleles)[v.genotypes])
 ```
 
@@ -120,7 +121,7 @@ require(["d3"], function(d3) {window.d3 = d3;});
 
 ```{code-cell} ipython3
 import tskit_arg_visualizer
-d3arg = tskit_arg_visualizer.D3ARG.from_ts(ts=ts_arg)
+d3arg = tskit_arg_visualizer.D3ARG.from_ts(ts=arg)
 w, h = 450, 300  # width and height
 d3arg.draw(w, h, edge_type="ortho", sample_order=[0, 2, 1, 3, 5, 4])
 ```
@@ -135,16 +136,16 @@ ancestor nodes (unlabelled) in blue.
 :"tags": ["hide-input"]
 # Plot the recombination nodes in red, with a horizontal line at the time of occurrence,
 # and only label nodes that are samples or recombination nodes.
-samples = set(ts_arg.samples())
-re_nodes = set(nd.id for nd in ts_arg.nodes() if nd.flags & msprime.NODE_IS_RE_EVENT)
-ca_nodes = set(np.arange(ts_arg.num_nodes)) - re_nodes - samples
-re_times = [int(nd.time) for nd in ts_arg.nodes() if nd.flags & msprime.NODE_IS_RE_EVENT]
+samples = set(arg.samples())
+re_nodes = set(nd.id for nd in arg.nodes() if nd.flags & msprime.NODE_IS_RE_EVENT)
+ca_nodes = set(np.arange(arg.num_nodes)) - re_nodes - samples
+re_times = [int(nd.time) for nd in arg.nodes() if nd.flags & msprime.NODE_IS_RE_EVENT]
 style = ".y-axis .grid {stroke: #ff000033} .mut .sym {stroke: goldenrod}"
 for u in re_nodes:
     style += f".n{u} > .sym {{fill: red}}"
 for u in ca_nodes:
     style += f".n{u} > .sym {{fill: blue}}"
-ts_arg.draw_svg(
+arg.draw_svg(
     size=(600, 300),
     y_axis=True,
     y_ticks=re_times,
@@ -210,7 +211,7 @@ rate and population size, using the {func}`msprime:msprime.log_arg_likelihood` m
 print(
     "Log likelihood of the genealogy under the Hudson model:",
     msprime.log_arg_likelihood(
-        ts_arg,
+        arg,
         recombination_rate=parameters["recombination_rate"],
         Ne=parameters["population_size"]
     )
@@ -235,7 +236,7 @@ This loses information about the timings of recombination and non-coalescent
 common ancestry, but it still keeps the local tree structure intact:
 
 ```{code-cell}
-ts = ts_arg.simplify()
+ts = arg.simplify()
 ts.draw_svg(
     size=(600, 300),
     y_axis=True,
@@ -268,7 +269,7 @@ the topology and branch lengths of the local trees remain unchanged after simpli
 
 ```{code-cell}
 print("Log likelihood of mutations given the genealogy:")
-print(' "full" ARG:',  msprime.log_mutation_likelihood(ts_arg, mutation_rate=mu))
+print(' "full" ARG:',  msprime.log_mutation_likelihood(arg, mutation_rate=mu))
 print(" simplified:", msprime.log_mutation_likelihood(ts, mutation_rate=mu))
 ```
 
@@ -290,17 +291,17 @@ its simplified version:
 ```{code-cell}
 large_sim_parameters = parameters.copy()
 large_sim_parameters["sequence_length"] *= 1000
-large_ts_arg = msprime.sim_ancestry(
+large_arg = msprime.sim_ancestry(
     **large_sim_parameters,
     discrete_genome=False,  # not technically needed, as we aren't calculating likelihoods
     coalescing_segments_only=False,
     additional_nodes=msprime.NodeType.COMMON_ANCESTOR | msprime.NodeType.RECOMBINANT,
 )
-large_ts = large_ts_arg.simplify()
+large_ts = large_arg.simplify()
 
 print(
     "Non-coalescent nodes take up "
-    f"{(1-large_ts.num_nodes/large_ts_arg.num_nodes) * 100:0.2f}% "
+    f"{(1-large_ts.num_nodes/large_arg.num_nodes) * 100:0.2f}% "
     f"of this {large_ts.sequence_length/1e6:g} megabase {large_ts.num_samples}-tip ARG"
 )
 ```
@@ -331,7 +332,7 @@ stored on *edges* (via the {attr}`~Edge.left` and  {attr}`~Edge.right` propertie
 rather than on nodes. Here, for example, is the edge table from our ARG:
 
 ```{code-cell}
-ts_arg.tables.edges
+arg.tables.edges
 ```
 
 Technically therefore, ARGs stored by `tskit` are edge-annotated
@@ -425,6 +426,27 @@ which are never children of an edge are not visited by this algorithm. Such
 nodes are either {ref}`isolated<sec_data_model_tree_isolated_nodes>` or a
 {ref}`root<sec_data_model_tree_roots>` in each local tree.
 
+(sec_args_counting_unique_children)=
+
+### Counting unique children
+
+Because edges are ordered first by parent ID, then by child ID, it is efficient
+to calculate the number of unique child IDs associated with a given parent.
+Above we passed over the edges using a `groupby` operation, but the canonical
+way to do this in the `numpy` library is to find where the group boundaries
+are, and count those:
+
+```{code-cell}
+def arg_num_children(ts):
+    # Return an array giving the number of unique child IDs for each node, anywhere in the genome.
+    # Relies on the fact that edges in a tree seq are sorted by parent ID then by child ID
+    same_parent = np.concatenate((ts.edges_parent[1:] == ts.edges_parent[:-1], [False]))
+    same_child  = np.concatenate((ts.edges_child[1:] == ts.edges_child[:-1], [False]))
+    is_last_unique = ~same_parent | ~same_child  # last occurrence of each unique child per parent
+    return np.bincount(ts.edges_parent[is_last_unique], minlength=ts.num_nodes)
+
+print(arg_num_children(arg))
+```
 
 (sec_args_other_analysis)=
 
@@ -458,26 +480,26 @@ def to_networkx_graph(ts, interval_lists=False):
     nx.set_node_attributes(G, {n.id: {'flags':n.flags, 'time': n.time} for n in ts.nodes()})
     return G
 
-arg = to_networkx_graph(ts_arg)
+graph = to_networkx_graph(arg)
 ```
 
 It is then possible to use the full range of networkx functions to analyse the graph:
 
 ```{code-cell} ipython3
-assert nx.is_directed_acyclic_graph(arg)  # All ARGs should be DAGs
-print("All descendants of node 10 are", nx.descendants(arg, 10))
+assert nx.is_directed_acyclic_graph(graph)  # All ARGs should be DAGs
+print("All descendants of node 10 are", nx.descendants(graph, 10))
 ```
 
 Networkx also has some built-in drawing functions: below is one of the simplest ones
 (for other possibilities, see the {ref}`sec_tskit_viz` tutorial).
 
 ```{code-cell} ipython3
-for layer, nodes in enumerate(nx.topological_generations(arg.reverse())):
+for layer, nodes in enumerate(nx.topological_generations(graph.reverse())):
     for node in nodes:
-        arg.nodes[node]["layer"] = layer
-pos = nx.multipartite_layout(arg, subset_key="layer", align='horizontal')
+        graph.nodes[node]["layer"] = layer
+pos = nx.multipartite_layout(graph, subset_key="layer", align='horizontal')
 
-nx.draw_networkx(arg, pos=pos)
+nx.draw_networkx(graph, pos=pos)
 ```
 
 ## Other software

From 7c8d073a6b0ea3c03ab739fce7d7e1c2769d6b25 Mon Sep 17 00:00:00 2001
From: Yan Wong <yan.wong@bdi.ox.ac.uk>
Date: Sun, 26 Apr 2026 12:22:22 +0100
Subject: [PATCH 2/3] Simpify the viz examples

Now that we have inherited_state
---
 viz.md | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/viz.md b/viz.md
index 3ff3fc69..9d28d961 100644
--- a/viz.md
+++ b/viz.md
@@ -337,19 +337,17 @@ See {ref}`sec_tskit_viz_dynamic_effects` if you want to dynamically hide and sho
 labels.
 
 ```{code-cell} ipython3
-nd_labels = {}  # An array of labels for the nodes
-for n in ts_mutated.nodes():
+nd_labels = {  # An array of labels for the nodes
     # Set sample node labels from metadata. Here we use the population name, but you might want
     # to use the *individual* name instead, if the individuals in your tree sequence have names
-    if n.is_sample():
-        nd_labels[n.id] = ts_mutated.population(n.population).metadata["name"]
+    n.id: ts_mutated.population(n.population).metadata["name"]
+    for n in ts_mutated.nodes()
+    if n.is_sample()
+}
 
-mut_labels = {}  # An array of labels for the mutations
-for mut in ts_mutated.mutations():  # Make pretty labels showing the change in state
-    site = ts_mutated.site(mut.site)
-    older_mut = mut.parent >= 0  # is there an older mutation at the same position?
-    prev = ts_mutated.mutation(mut.parent).derived_state if older_mut else site.ancestral_state
-    mut_labels[mut.id] = f"{prev}→{mut.derived_state}"
+mut_labels = { # An array of labels for the mutations
+    mut.id: f"{mut.inherited_state}→{mut.derived_state}" for mut in ts_mutated.mutations()
+}  
 
 ts_mutated.draw_svg(
     size=(1000, 300),
@@ -370,12 +368,8 @@ knowledge of the SVG graphics language (see the {ref}`sec_tskit_viz_legend_examp
 later in this tutorial for a node colour legend).
 
 ```{code-cell} ipython3
-mut_labels = {  # Alternative way of defining a label dictionary using a dict comprehension
-    mut.id: (
-        (ts_mutated.mutation(mut.parent).derived_state if mut.parent < 0 else site.ancestral_state) +
-        str(int(site.position)) +
-        mut.derived_state
-    )
+mut_labels = {
+    mut.id: f"{mut.inherited_state}{int(site.position)}{mut.derived_state}"
     for site in ts_mutated.sites() for mut in site.mutations
 }
 

From edc20483ca73314cc847dc2ab22dae7b80fe36fa Mon Sep 17 00:00:00 2001
From: Yan Wong <yan.wong@bdi.ox.ac.uk>
Date: Mon, 20 Apr 2026 21:47:16 +0100
Subject: [PATCH 3/3] Make a simplification tutorial

---
 _toc.yml                        |   2 +
 advanced_simplification.md      | 249 +++++++++++++++++++++++++++
 data/simplification_basic.trees | Bin 0 -> 8428 bytes
 simplification.md               | 295 +++++++++++++++++++++++++++++++-
 4 files changed, 540 insertions(+), 6 deletions(-)
 create mode 100644 advanced_simplification.md
 create mode 100644 data/simplification_basic.trees

diff --git a/_toc.yml b/_toc.yml
index a747a1aa..4d945aaa 100644
--- a/_toc.yml
+++ b/_toc.yml
@@ -19,6 +19,8 @@ parts:
   chapters:
   - file: tables_and_editing
   - file: simplification
+    sections:
+    - file: advanced_simplification
   - file: viz
   - file: metadata
   - file: args
diff --git a/advanced_simplification.md b/advanced_simplification.md
new file mode 100644
index 00000000..2d3f2061
--- /dev/null
+++ b/advanced_simplification.md
@@ -0,0 +1,249 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.12
+    jupytext_version: 1.9.1
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+```{currentmodule} tskit
+```
+
+(sec_advanced_simplification)=
+
+# _Advanced simplification_
+% remove underscores in title when tutorial is complete or near-complete
+
+
+This is a companion to the basic {ref}`sec_simplification` tutorial.
+It focuses on details of `simplify` behavior that are useful when you need precise
+control over node IDs, retained ancestry structure, and sample flags.
+
+If you are primarily subsetting samples or reducing tree sequence size, start with
+{ref}`sec_simplification`.
+
+
+```{code-cell} ipython3
+:"tags": ["hide-input"]
+# Build a full ARG for demonstrations
+import msprime
+import numpy as np
+import tskit
+import tskit_arg_visualizer as argviz
+
+arg = msprime.sim_ancestry(
+    samples=10,
+    sequence_length=1e4,
+    recombination_rate=1e-8,
+    population_size=1e4,
+    record_full_arg=True,
+    random_seed=123,
+)
+arg = msprime.sim_mutations(arg, rate=1e-8, random_seed=124)
+```
+
+:::{note}
+You can both simplify an immutable {class}`TreeSequence` (to return a new one),
+or run {meth}`~TableCollection.simplify` on a {class}`TableCollection` (which
+modifies in place, and requires {ref}`sec_table_indexes` to be built as well as the
+tables to be {meth}`sorted <TableCollection.sort>`). Simplifying tables in place
+is often useful for {ref}`forward-time simulations <sec_tskit_forward_simulations>`.
+:::
+
+## 1) Tracking node ID changes
+
+With default settings, simplification compacts tables and therefore reassigns node
+IDs. If downstream code makes use of stored node IDs, request a map:
+
+```{code-cell} ipython3
+focal = arg.samples()[-6:]  # pick last 6 samples (3 diploids)
+simp, node_map = arg.simplify(focal, map_nodes=True)
+
+print(f"Original ARG: {arg.num_individuals} individuals, simplified to {simp.num_individuals} individuals")
+print(f"Original ARG: {arg.num_nodes} nodes,  simplified to {simp.num_nodes} nodes")
+print("Dropped nodes:", int(np.sum(node_map == tskit.NULL)))
+print("Old sample ID", int(focal[0]), "maps to new ID", int(node_map[focal[0]]))
+```
+
+Note that when simplifying tables in-place using {meth}`TableCollection.simplify` a map
+is always returned. To avoid compacting the node table, and leave node IDs unchanged, use
+`filter_nodes=False`.
+
+### Obtaining the reverse map
+
+Often you might want a reverse map, mapping the new node IDs to the old ones. Here's
+a simple way to do this:
+
+```{code-cell} ipython3
+def invert_map(node_mapping):
+    kept = node_mapping != tskit.NULL
+    indexes = node_mapping[kept]  # indexes are guaranteed 0..N-1
+    rev_map = np.full_like(indexes, tskit.NULL)
+    rev_map[indexes] = np.flatnonzero(kept)
+    return rev_map
+
+reverse_map = invert_map(node_map)
+print("New sample ID 0", "maps to old ID", int(reverse_map[0]))
+```
+
+## 2) Keeping input roots
+
+:::{todo}
+This is easy to illustrate, and useful for forward sims / census approaches
+:::
+
+## 3) Setting sample flags
+
+Normally the nodes that are provided to the `simplify()` function are marked as sample
+nodes in the output (by setting the `NODE_IS_SAMPLE` flag), and other nodes have that flag unset.
+If you provide the `update_sample_flags=False` option, all node flags are left unchanged.
+Here are some cases where that can be useful.
+
+### Parallel simplification
+
+One use for the `update_sample_flags=False` option combines it with `filter_nodes=False`,
+to ensure that the node table remains untouched during simplification.
+This is primarily a use-case targetted at developers of forward simulators, and allows
+logically disjunct parts of the edge table to be simplified in parallel, without
+risking two parallel processes trying to alter the same data.
+
+:::{todo}
+Should we provide an example? However, this tends to be done at a lower level, e.g.
+using the C API, and is likely to be only useful for developers. 
+:::
+
+### Retaining non-coalescent nodes
+
+The `keep_unary=True` option retains *all* ancestral nodes after simplification, even
+if they don't represent coalescences. Sometimes you might want to retain *some* but not
+all such nodes, which can be done by adding them to the focal node list but using
+`update_sample_flags=False` to ensure they are not marked as samples in the final output.
+This is usually followed by an additional simplification pass to remove any topology
+above the additional nodes which is not ancestral to the true samples. Here are some examples:
+
+#### Keeping non-coalescent regions of coalescent nodes
+
+By default, `simplify()` deletes not just non-coalescent nodes, but also
+removes from the ancestry those regions of coalescent nodes which do not represent
+a coalescence in the local tree (i.e. are "locally unary"). We can identify coalescent
+nodes using an initial round of simplification:
+
+```{code-cell}
+simp, node_map = arg.simplify(focal, map_nodes=True)
+keep_nodes = np.where(node_map != tskit.NULL)[0]  #includes sample and coalescent nodes
+
+# Retain lots of nodes by treating them as focal, but don't change sample flags
+tmp_ts = arg.simplify(keep_nodes, update_sample_flags=False)
+# Now re-simplify, in case any coalescent nodes have unwanted ancestry above
+part_simp_ts = tmp_ts.simplify(keep_unary=True)
+```
+
+Often, leaving in the non-coalescent regions of coalescent nodes can lead to a reduction
+in the number of edges. This is one reason that the {meth}`TreeSequence.extend_haplotypes` exists
+(see the documentation of that method for more detail).
+
+```{code-cell} 
+print(f"Full simplification to samples {focal} leads to {simp.num_edges} edges")
+print(
+  f"Similar simplification leaving partially-coalescent nodes leads to {part_simp_ts.num_edges} edges"
+)
+part_simp_ts.draw_svg(title=(
+    f"ARG simplified to samples {focal}, leaving part-coalescent nodes: "
+    f"this has {part_simp_ts.num_edges} edges"
+))
+```
+
+
+#### Subsetting a full ARG
+
+The {ref}`sec_args` tutorial discusses the idea of a "full ARG", containing recombination and 
+non-coalescent common ancestor nodes, which do not correspond to coalescent events anywhere in 
+the genealogy. We can use `simplify()` to reduce a full ARG to a subset of samples, while retaining those non-coalescent nodes. Again, we need to figure out
+which nodes to keep, then add them to the focal nodes, simplifying with `update_sample_flags=False`.
+
+We can detect the common ancestor nodes by finding those which still have 2 unique children after
+simplifying with `keep_unary=True` and the recombination nodes
+by finding which still come in pairs after such simplification.
+
+```{code-cell} ipython3
+def arg_num_children(ts):  # see the ARG tutorial which explains this code
+    same_parent = np.concatenate((ts.edges_parent[1:] == ts.edges_parent[:-1], [False]))
+    same_child  = np.concatenate((ts.edges_child[1:] == ts.edges_child[:-1], [False]))
+    is_last_unique = ~same_parent | ~same_child  # last occurrence of each unique child per parent
+    return np.bincount(ts.edges_parent[is_last_unique], minlength=ts.num_nodes)
+
+re_node_pairs = np.where(arg.nodes_flags & msprime.NODE_IS_RE_EVENT)[0].reshape(-1, 2)
+print("RE nodes before simplification\n", re_node_pairs)
+```
+
+Identifying the _msprime_ recombination nodes that stay as pairs after simplification requires
+a little work:
+
+:::{todo}
+Currently the code below doesn't quite work, because `keep_unary` forces the nodes above the local
+roots to be kept, see https://github.com/tskit-dev/tskit/issues/3450. This means that some RE
+(and possibly CA) nodes are kept when they should be discarded. 
+:::
+
+```{code-cell} ipython3
+simp, node_map = arg.simplify(focal, keep_unary=True, map_nodes=True)
+reverse_map = invert_map(node_map)
+simp_re_nodes = reverse_map[np.where(simp.nodes_flags & msprime.NODE_IS_RE_EVENT)[0]]
+valid_pairs = np.isin(re_node_pairs, simp_re_nodes).all(axis=1)
+keep_re_nodes = re_node_pairs[valid_pairs, :]
+print("paired RE nodes after simplification\n", keep_re_nodes)
+
+keep_RE_nodes = keep_re_nodes.flatten()
+keep_CA_nodes = reverse_map[arg_num_children(simp) > 1]
+```
+
+Now that we have defined which nodes to keep, we can use the same trick as before,
+passing these nodes as focal, but simplifying twice, once with `update_sample_flags=False`
+then again with `keep_unary=True`:
+
+```{code-cell} ipython3
+keep = np.concatenate((focal, keep_CA_nodes, keep_RE_nodes))
+
+tmp_arg = arg.simplify(keep, update_sample_flags=False)
+subset_arg = tmp_arg.simplify(keep_unary=True)  # Defaults to focal nodes = existing samples
+```
+
+Here's what it looks like in graph form:
+
+```{code-cell} ipython3
+d3arg = argviz.D3ARG.from_ts(ts=subset_arg)
+d3arg.draw(title=f"A full ARG, subset to {subset_arg.num_samples} samples");
+```
+
+## 4) Keeping individuals
+
+In some cases, a tree sequence might contain historical individuals which are associated
+with nodes that are not samples, and you wish to retain information on individuals which are
+ancestral to the sample nodes. For example a forward-time simulation could
+define individuals for all nodes in the past, including the pedigree links between parents
+and children (see also discussion in the SLiM manual, and at
+https://github.com/MesserLab/SLiM/issues/139).
+
+To keep all the individuals associated with genetic ancestry, you can use
+`keep_unary_in_individuals=True`. 
+
+:::{todo}
+Should we have a demonstration here? {ref}`sec_tskit_forward_simulations` could be used to
+create a simulator that saves pedigree information into each individual, and we could distill
+some of the discussion from https://github.com/MesserLab/SLiM/issues/139 into that.
+:::
+
+## 5) reduce_to_site_topology
+
+:::{todo}
+Explain why you might use `reduce_to_site_topology`, e.g. if there is a lot of information
+that is not inferable from sites. Note that this does not change the topology at each site,
+although some of the topology at a site might not actually be needed to represent the site
+variation (to see how to condense topology into "polytomies", see
+https://github.com/tskit-dev/tskit/discussions/2926).
+:::
\ No newline at end of file
diff --git a/data/simplification_basic.trees b/data/simplification_basic.trees
new file mode 100644
index 0000000000000000000000000000000000000000..059fd822709a0650450e9c6d0e7f629b73b86053
GIT binary patch
literal 8428
zcmeHMU#J{s72m2gF($QI|0Ncgj(I3@_s{IzyEk`(vX?YaTSG&dT1#oCb7#N1cbfUL
zelvTU+v}BpMQW|a$Gi(t=!1bGErO6DV!#LcVEbxN#6SzxCo9@N=ggen&fLjvb{eo$
zaA7m^opXNY|Mz`oZ+>|1>iq}y9olzOtycRwecv$XpZD?kn8;rGRQB>oS@%Diz1$+}
zS&vT$<M)E!C+o8n@$5H$UKKp-T@`%Bo&E34!upHgKPPM8g*;OM?7vmQ-w8!Ckr@N|
zJ8)~Z-Y@dNBc54mkiUb1NB$oZ`}6S7d$@#uMexuEx$+bFYYHCrt_c1kWk6|-_`fFj
z+l7C%qW{1@B=|#uH$I=ez@L)Y$rAp@g8#H|zAbp@p8!fP$p2G<N84=(z9fFl680|$
z{!1bOJowiDg@FEhp=AHu`?UYNl6~N-+W&J2f1#p%<o9jC-zITY#lyc_rEsV}@ZitN
zALw5d{9Ykktmr@RZNcNbsESAYt2c;8|DC)+Jo?8LJn~=Ff8<vS9{C3z`jMyd6Z~gO
z_?If?ANl)vDgLT><nNb)mo}_b#l!w@1uu`vc-VjK2K`6={wnw{NP^-2yz>|J`;Xwi
zEci>3|BsdnR$9aUq0eLtdDLF1SU=cb5a2=44}4Yo$4mGt744&bj|v|51@QCo2fQzM
z^v_kXUlsq|lKsC|#G`+nDdFF$h)4b}2p;$SUTGwpXQi@FmWcmnrTFizh)4gtQ1bt2
zH1fpK^cVR5vfyQGr~!|9Oh?yhu>V>Kf4QQ4#Q%CJe&FZj5B2-E;4%O1{bJ_vJUslr
zSq^T@ALj&r$E?7Gj{F@IJm!z*1V68TVZT$tUljZ==GX`RKEX>GYLCf5aOvaMR{;MV
z!Q=cej0{)x{CHCE=%1G0JJ&Z~gZ=Y@$Naic(LU<`yx>vqvw}x{X1M|WuS))(6a2jT
z!T&#${C~Ql|FHkM;BkIl5d18^L5KZ+mHY?3YW;4J#z+00t7sqj|BB$@Kk)PNhd3<3
zWB<M+_Hmv<Zr1at;BlT@7X0B^0HLeFP{mDm!}AB(tFP9R>@84<GjI}T_sWje*JD)@
zv14~b1tK|ev<gK)d&7-nJ<@@bG(#s)^17*X>_wp$4%AsP&lYSijFnD^YDapY^e&!O
zLA2@k6OOm(4aSZiH`je<7?bFP;76{*N~Y1o^&$%~PNQ8%T;eUFvZ#0YA|%lRQ+SGw
zz#Hn+<G49caZ*@vB??R6rjTGER|Vl~jfBxa6;z}JDy-vZtlgqO;sq)gCleR7x2Xnp
zOoUlvcryXPnOqGhGezO|L`UlOWR+z$*B5(5Z`fs)m-fhB^IBzVxUhRA4wZ;mcW1;=
zG#dM9HnKGv%V}{FdZ*Du`&3qDW2B=^6*{3y&THk;PPmo=Ed^0_f=UvMiqaFJ0Alr>
zu_Dd3ufk!n!8wbYsi{QM-bzb4l?Z$aKzx}bR#Qc!h?(Rlq9<)k_R1cIUJ{ST-XNWy
z_Rv|x=QN*Pe4gVsG+*6(zVeyK=OF)Wi_g;o^!;D^a7_H=cn}lfLTnr#$Hrg2M{t+a
z`se$!tbGvABtGLMjYKOIrzY7at`C3suIBs~YsEEUK4L>m#aQ^AV`Lu3&a#YQEZ3Xs
z&N^8)<^g*BuYOjK<?0bvPRZZcKlbZg`7np&7k~Y?ms?J!AH2PN(*r-+=`-zmU&dAW
zKnL4It~ocbKkZpX?}uIH2P60VGq)e^|I}&z@!gBf{@veb?s=+lPyfW_JzM|$#{>N<
zr`q?uIeM`F&iD5p-Tx=m-}mB+=5K!dY=7INbKce7h|d<YXPOHp&l&Var>xyFcM5^u
zLGRn9GZ>@?k>j7#(TGP56`Q?8kJW<d(yieV_VJjMEtqtgP6SGZw#{T~L}nf~<Ai3F
zA=@4MzWK-w9DoD_c8H_t2==rZFfMg-VVC=lEQnS!T6<i%33<P5W*37kBa_)POsmmq
zw9Ey=jHC7B37WQuxGiBDW)P1wow3Z{h3Pb`HOt%~GTok(qJ(C!6zE1l;DqE8Df9wc
zZiKDW09`3Qr^o0Jjs}N_hLzFA77;jFc47CWLJX_bYO%0KWg#Q>knUnG2oQX&1#4cI
z-k~<lJ_R1hD%*`haeNM!t}Qo_+bq|g->!D-J;}-!Q-?-=<P8d*2UKf6hk0}*dAJ61
zGTn+gvH8-<X(}9)DA`qQ=W|ppEUa}(m+k0b&dB@FPA7?+<3<i8-KDOwV>P5%CW-AR
z^b5X+<G}WvHRW?@q!OBH7zKpj_A)wcE>1#wz@Ze;Q0&`*cAbK_8u=d0$~<_}6i$p5
z_l62LJspK?lt|lVl=Vow71M-86!R`eKjG2lFiiHEDGxbeTtCsScN(c3W@+UsCr-KT
zPNSQ3YoQx6CC0r<hsrm;mHiqpGSQxKA_|Rr#=c>749n^*wt7pJaq_OyM!VHsI%?=B
zN_uC*aow^y)^fw@v>M%&?^(@L$G>@e_5S1b>Zudmj(z%wWwqIrz@Z;EVd{#bgKj6=
z0b9ui9~Ts$#~se$rv1qA96OnD30!b_v0=3v79A)=_PsUj==9tm4QCw6vuwfwfTSDQ
zeoODv)L4Pdn>b?+V{XIkqOo@4h`B?b4}SMgW@^qIj{($B+1Qx+X!Gu*JrO3h&AFoW
zSo6<4n|jnvlOsFYQuS7s^S^tWRohQ9505iSjmYlQmS1dJ|L<UK>JL)~^Y!}<4KVmS
z9PQz5z169=Tc@p+p4I7DD~*+<_TplDI;XR;WZ??u{~PLH!B3itkFDkeo}GARn)$gB
x_EX-SKk4&LBfq;-QrlTs?k+74>fN^6st?w?Yjt<oS+2X*O53$u_egiydJl1z55WKc

literal 0
HcmV?d00001

diff --git a/simplification.md b/simplification.md
index d5464647..10c099c6 100644
--- a/simplification.md
+++ b/simplification.md
@@ -16,17 +16,300 @@ kernelspec:
 
 ```{code-cell} ipython3
 :tags: [remove-cell]
-def create_notebook_data():
-    pass
+import msprime
 
+def create_notebook_data():
+    ts = msprime.sim_ancestry(
+      samples=4,
+      sequence_length=1_000,
+      population_size=300,
+      random_seed=42,
+    )
+    ts = msprime.sim_mutations(ts, rate=1e-6, random_seed=321)
+    assert ts.num_mutations == 2
+    site_pos = ts.simplify([0, 1, 2]).sites_position
+    assert len(site_pos) == 1 and site_pos[0] == ts.sites_position[1]
+    ts.dump("data/simplification_basic.trees")
 # create_notebook_data()  # uncomment to recreate the tree seqs used in this notebook
 ```
 
 (sec_simplification)=
 
-# _Simplification_
-% remove underscores in title when tutorial is complete or near-complete
+# Simplification
+
+The {meth}`~TreeSequence.simplify` method provides one of the most powerful ways to modify a
+[tskit](https://tskit.dev) {class}`TreeSequence`. It removes and modifies edges to leave only the
+ancestry of a provided set of focal nodes. By default it ensuring these focal nodes are marked as
+samples and removes non-ancestral nodes and associated objects such as individuals and populations.
+It is commonly used:
+
+* In forward simulations, to remove lineages that have gone extinct
+* To create a smaller tree sequence focussed on a subset of samples
+* To remove redundant nodes and other tskit objects (e.g. unreferenced populations)
+
+Other less common uses, such as retaining unary regions of coalescent nodes, and
+simplification in parallel, are described in the {ref}`sec_advanced_simplification` tutorial.
+
+
+## A single tree example
+
+We start with a very small example for ease of visualisation. This is
+a tree sequence consisting of a single tree with 8 haploid genomes
+(4 diploid individuals) and 2 variable sites.
+
+```{code-cell} ipython3
+import tskit
+ts = tskit.load("data/simplification_basic.trees")
+plot_params = {"size": (500, 200), "time_scale": "log_time", "y_axis":True, "y_ticks": [0, 10, 100, 1000]}
+ts.draw_svg(**plot_params)
+```
+
+Suppose we only want to retain the ancestry of sample nodes 0, 1, and 2. We can do this
+by passing those IDs as the new samples to the {meth}`~TreeSequence.simplify` method:
+
+```{code-cell} ipython3
+focal_node_ids = [0, 1, 2]
+ts_simp1 = ts.simplify(samples=focal_node_ids)
+ts_simp1.draw_svg(**plot_params)
+```
+
+Restricting the sample nodes to [0, 1, 2] makes the ancestry much simpler.
+Note that one of the mutations is not relevant to the new samples, so it has been
+filtered out, causing the ID of the remaining mutation to change from 1 to 0.
+Similarly, many nodes have been filtered out, resulting in changed node IDs
+(e.g. the root node at time 1000 is the same in both trees, but its ID has
+changed from 14 to 4).
+
+To keep node IDs the same, you can specify `filter_nodes=False`. Although this makes
+the result easier to compare with the original, it is not generally recommended, as it
+leaves redundant nodes cluttering up the tree sequence.
+
+```{code-cell} ipython3
+ts_simp2 = ts.simplify(focal_node_ids, filter_nodes=False, filter_sites=False)
+# Node IDs should now remain unchanged
+ts_simp2.draw_svg(**plot_params)
+```
+
+Note that the example above also used another `filter_` argument, setting
+`filter_sites=False`, so that the first site, which has no mutations after
+simplification, is also retained (it is shown as a bare tick mark on the X axis,
+around position 250). However, mutations above unused nodes are still deleted
+so mutation IDs are not guaranteed to stay the same.
+
+To further reduce the size of the simplified tree sequence, simplification normally
+removes nodes from the ancestry that no longer represent branch points (coalescences).
+We can leave those in using `keep_unary=True`.
+
+```{code-cell} ipython3
+ts_simp3 = ts.simplify(focal_node_ids, filter_nodes=False, keep_unary=True)
+ts_simp3.draw_svg(**plot_params)
+```
+
+:::{note}
+As modifying a tree sequence can change the IDs of nodes, sites, and other objects,
+it can be useful to use {ref}`metadata <sec_tutorial_metadata>`:
+information that stays associated with tskit objects even when their IDs change.
+When simplifying, it is also possible to keep track of node ID changes by using
+the `map_nodes` parameter, as demonstrated later in this tutorial.
+:::
+
+## A larger simplification example
+
+Here we examine the impact of simplification on the efficiency of tree sequence
+storage and processing. We'll start with a larger backward simulation that has
+a handful of admixed individuals:
+
+```{code-cell} ipython3
+demography = msprime.Demography()
+demography.add_population(name="SMALL", initial_size=1000)
+demography.add_population(name="BIG", initial_size=4000)
+demography.add_population(name="ADMIX", initial_size=500)
+demography.add_population(name="ANC", initial_size=1500)
+demography.add_admixture(
+    time=100, derived="ADMIX", ancestral=["SMALL", "BIG"], proportions=[0.5, 0.5]
+)
+demography.add_population_split(time=1_000, derived=["SMALL", "BIG"], ancestral="ANC")
+
+big_ts = msprime.sim_ancestry(
+  samples={"SMALL": 400, "BIG": 400, "ADMIX": 6, "ANC": 400},
+  demography=demography,
+  sequence_length=5e7,
+  recombination_rate=2e-8,
+  random_seed=2432,
+)
+big_ts = msprime.sim_mutations(big_ts, rate=1e-8, random_seed=6151)
+print(
+  f"`big_ts` represents a simulation with admixture of {big_ts.num_samples} samples",
+  f"over {big_ts.sequence_length/1e6:g} Mb ({big_ts.num_trees} trees)",
+)
+```
+
+## Use case 1: remove historical samples
+
+Here, about a third of the sample nodes (those from the `ANC` population) exist
+at times prior to the current generation, i.e. they are *historical* sample nodes.
+In fact, in forward simulations most nodes will be of this sort, left over from
+previously simulated generations. These are often unwanted, and one of the main
+use cases for simplification is to reduce the ancestry to that of just the
+contemporary genomes: i.e. removing any edges, nodes, and mutations that track
+"extinct" lineages.
+
+```{code-cell} ipython3
+modern_sample_ids = big_ts.samples(time=0)
+ts_modern = big_ts.simplify(modern_sample_ids)
+print(f"Tree sequence simplified to {ts_modern.nbytes/big_ts.nbytes:.1%} of original size")
+```
+
+Simplifying has only produced a modest reduction in size, but you can imagine that
+in a forward simulation, where the majority of genomes are historical, repeated
+simplification can result in huge savings. In practice, simulators usually do this
+regular simplification of the tables used to store the paths of genetic inheritance
+automatically, so using
+{meth}`~TableCollection.simplify` in this way is mainly of interest if you are
+{ref}`building your own forward simulator <sec_tskit_forward_simulations>`.
+
+## Use case 2: simplify to a subset of samples
+
+Often you might want to focus on only one group of samples, for example the small
+group of `ADMIX` individuals (population ID 2 in this simulation):
+
+```{code-cell} ipython3
+admix_pop_id = 2
+assert big_ts.population(admix_pop_id).metadata["name"] == "ADMIX"
+admix_sample_ids = big_ts.samples(population=admix_pop_id)
+
+ts_admix, node_map = big_ts.simplify(admix_sample_ids, map_nodes=True)
+print(f"Tree sequence simplified to {ts_admix.nbytes/big_ts.nbytes:.2%} of original size")
+print()
+print(f"Previous admixed sample IDs were {admix_sample_ids}")
+print(f"Simplifying has changed these to {node_map[admix_sample_ids]}")
+
+# Check that these are indeed the only sample IDs
+assert set(node_map[admix_sample_ids]) == set(ts_admix.samples())
+```
 
-:::{todo}
-Create content. See https://github.com/tskit-dev/tutorials/issues/52
+:::{note}
+The `map_nodes=True` argument means that `simplify()` returns both a new
+tree sequence and an array mapping each old node ID to its new ID, or to
+`tskit.NULL` if that node is removed.
+Here you can see that (unlike in previous examples) the sample node IDs
+have changed: unless `filter_nodes=False`, the _N_ node IDs provided as the `samples`
+argument will be allocated new IDs from 0 to _N_ - 1 in the returned tree sequence (so simplify can be used to reorder sample IDs, although
+{meth}`~TreeSequence.subset` is a way to do this with fewer side effects).
 :::
+
+### Efficiency
+
+Edges take up the majority of the space in most tree sequences. In this case you can
+see that although simplify has reduced the sample nodes to 12 genomes from
+the 6 diploid `ADMIX` individuals (a reduction of 99.5%), the number of edges
+has not been reduced by such a large amount.
+That's because many of the ancestors of the SMALL and BIG populations are also shared
+by `ADMIX`. It also shows why tree sequence structures are so effective for encoding
+and analysing large datasets: storage and processing efficiency, in particular the
+number of edges, is sub-linear in the number of samples.
+
+```{code-cell} ipython3
+print(
+    f"The simplified tree sequence has only {ts_admix.num_samples / big_ts.num_samples:.2%} of the samples,",
+    f"but retains {ts_admix.num_edges / big_ts.num_edges:.2%} of the edges."
+)
+```
+
+If you want to analyse only the admixed individuals, using the simplified tree sequence
+is much more efficient than running equivalent operations on the original `big_ts`:
+
+```{code-cell} ipython3
+%%timeit
+# Speed test for decoding all the genetic variants of the admixed individuals
+for v in ts_admix.variants():
+    pass
+```
+
+Identical results can be obtained using the full tree sequence and restricting calculations to the `admix_sample_ids`, but this approach is much slower:
+
+```{code-cell} ipython3
+%%timeit
+# Equivalent processing of admixed individuals, using the full tree sequence
+for v in big_ts.variants(samples=admix_sample_ids):
+    pass
+```
+
+The same efficiencies apply to calculating statistics on subsets of genomic samples.
+As simplification has been highly optimised in `tskit`, if you perform repeated
+processing of the same subset of genomes, it can be worth simplifying before
+processing.
+
+### Removing other unused objects
+
+If we print out the original and admix-only (simplified) tree sequence, we can see
+that a number of other tables have also been reduced in size. For instance,
+simplification has reduced the number of individuals from 1206 to 6, and the
+number of sites to less than a sixth of the original.
+
+```{code-cell} ipython3
+print("Original tree sequence")
+big_ts
+```
+
+```{code-cell} ipython3
+print("Simplified tree sequence")
+ts_admix
+```
+
+Note that the call to {meth}`TreeSequence.simplify` has been recorded in the
+{ref}`sec_provenance` information. Like most tree sequence methods, you can pass
+`record_provenance=False` if you want this to be omitted (which will save space, but not
+lead to other efficiency gains).
+
+On closer inspection, you might be surprised to see that there are still 4 populations in
+the simplified tree sequence, although it contains only samples from the `ADMIX` population:
+
+```{code-cell} ipython3
+print(
+    "Sample nodes in `ts_admix` belong to the following populations",
+    ts_admix.tables.nodes.population[ts_admix.samples()],
+)
+ts_admix.tables.populations
+```
+
+The reason that the other populations (`BIG`, `SMALL`, and `ANC`) have been retained is
+that the simulation has assigned populations to both sample and nonsample nodes. If we
+{ref}`edit <sec_tables_editing>` the tree sequence tables such that ancestral
+(non-sample) genomes are not associated with defined populations, then simplification will
+remove all but the admixed population (and reassign the population IDs as
+necessary).
+
+An example of this is given in the code below, which performs a further round of simplification,
+taking advantage of the fact that if a list of focal nodes is not given, `simplify`
+uses the existing sample nodes.  
+
+```{code-cell} ipython3
+import numpy as np
+import tskit
+
+tables = ts_admix.dump_tables()
+samples = ts_admix.samples()
+# Make an array of NULL population values for each node
+nodes_population = np.full_like(tables.nodes.population, tskit.NULL)
+# Set the sample node populations back to their expected population
+nodes_population[samples] = ts_admix.nodes_population[samples]
+tables.nodes.population = nodes_population
+tables.simplify()  # This is the tables version of simplify, often used in forward sims
+ts_admix_only = tables.tree_sequence()
+
+print(
+    "Sample nodes in `ts_admix_only` belong to the following populations",
+    ts_admix_only.tables.nodes.population[ts_admix_only.samples()],
+)
+ts_admix_only.tables.populations
+```
+
+Although reducing the number of populations saves space, it requires care.
+For instance `admix_pop_id` can no longer be used to refer to the correct ID
+in the `ts_admix_only` tree sequence.
+
+## Extra uses for simplification
+
+Simplify is somewhat of a "Swiss Army knife" for tree sequences, and can be used in
+several other ways. See the {ref}`sec_advanced_simplification` tutorial for more details.