From f3f3dddfcbe9ff66e3391516ba9880713a25f29f Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Sat, 25 Apr 2026 11:20:10 +0100 Subject: [PATCH 1/3] Change the variable to `arg` in the ARG tute And add arg_num_children example. --- args.md | 76 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 27 deletions(-) diff --git a/args.md b/args.md index ebcb6c7a..8e13293d 100644 --- a/args.md +++ b/args.md @@ -49,7 +49,8 @@ coalescence in any of the local trees. Full ARGs can be stored and analysed in {func}`msprime:msprime.sim_ancestry` by specifying `coalescing_segments_only=False` along with `additional_nodes = msprime.NodeType.COMMON_ANCESTOR | msprime.NodeType.RECOMBINANT` (or the equivalent `record_full_arg=True`) as described -{ref}`in the msprime docs`: +{ref}`in the msprime docs`. To remind ourselves that this +is a full ARG, we will use the variable `arg` rather than `ts` ```{code-cell} import msprime @@ -62,7 +63,7 @@ parameters = { "random_seed": 333, } -ts_arg = msprime.sim_ancestry( +arg = msprime.sim_ancestry( **parameters, discrete_genome=False, # the strict Hudson ARG needs unique crossover positions (i.e. a continuous genome) coalescing_segments_only=False, # setting record_full_arg=True is equivalent to these last 2 parameters @@ -71,8 +72,8 @@ ts_arg = msprime.sim_ancestry( print('Simulated a "full ARG" under the Hudson model:') print( - f" ARG stored in a tree sequence with {ts_arg.num_nodes} nodes and" - f" {ts_arg.num_edges} edges (creating {ts_arg.num_trees} local trees)" + f" ARG stored in a tree sequence with {arg.num_nodes} nodes and" + f" {arg.num_edges} edges (creating {arg.num_trees} local trees)" ) ``` @@ -83,9 +84,9 @@ variation: ```{code-cell} import numpy as np mu = 1e-7 -ts_arg = msprime.sim_mutations(ts_arg, rate=mu, random_seed=888) -print(" Sample node: " + " ".join(str(u) for u in ts_arg.samples())) -for v in ts_arg.variants(): +arg = msprime.sim_mutations(arg, rate=mu, random_seed=888) +print(" Sample node: " + " ".join(str(u) for u in arg.samples())) +for v in arg.variants(): print(f"Variable site {v.site.id}:", np.array(v.alleles)[v.genotypes]) ``` @@ -120,7 +121,7 @@ require(["d3"], function(d3) {window.d3 = d3;}); ```{code-cell} ipython3 import tskit_arg_visualizer -d3arg = tskit_arg_visualizer.D3ARG.from_ts(ts=ts_arg) +d3arg = tskit_arg_visualizer.D3ARG.from_ts(ts=arg) w, h = 450, 300 # width and height d3arg.draw(w, h, edge_type="ortho", sample_order=[0, 2, 1, 3, 5, 4]) ``` @@ -135,16 +136,16 @@ ancestor nodes (unlabelled) in blue. :"tags": ["hide-input"] # Plot the recombination nodes in red, with a horizontal line at the time of occurrence, # and only label nodes that are samples or recombination nodes. -samples = set(ts_arg.samples()) -re_nodes = set(nd.id for nd in ts_arg.nodes() if nd.flags & msprime.NODE_IS_RE_EVENT) -ca_nodes = set(np.arange(ts_arg.num_nodes)) - re_nodes - samples -re_times = [int(nd.time) for nd in ts_arg.nodes() if nd.flags & msprime.NODE_IS_RE_EVENT] +samples = set(arg.samples()) +re_nodes = set(nd.id for nd in arg.nodes() if nd.flags & msprime.NODE_IS_RE_EVENT) +ca_nodes = set(np.arange(arg.num_nodes)) - re_nodes - samples +re_times = [int(nd.time) for nd in arg.nodes() if nd.flags & msprime.NODE_IS_RE_EVENT] style = ".y-axis .grid {stroke: #ff000033} .mut .sym {stroke: goldenrod}" for u in re_nodes: style += f".n{u} > .sym {{fill: red}}" for u in ca_nodes: style += f".n{u} > .sym {{fill: blue}}" -ts_arg.draw_svg( +arg.draw_svg( size=(600, 300), y_axis=True, y_ticks=re_times, @@ -210,7 +211,7 @@ rate and population size, using the {func}`msprime:msprime.log_arg_likelihood` m print( "Log likelihood of the genealogy under the Hudson model:", msprime.log_arg_likelihood( - ts_arg, + arg, recombination_rate=parameters["recombination_rate"], Ne=parameters["population_size"] ) @@ -235,7 +236,7 @@ This loses information about the timings of recombination and non-coalescent common ancestry, but it still keeps the local tree structure intact: ```{code-cell} -ts = ts_arg.simplify() +ts = arg.simplify() ts.draw_svg( size=(600, 300), y_axis=True, @@ -268,7 +269,7 @@ the topology and branch lengths of the local trees remain unchanged after simpli ```{code-cell} print("Log likelihood of mutations given the genealogy:") -print(' "full" ARG:', msprime.log_mutation_likelihood(ts_arg, mutation_rate=mu)) +print(' "full" ARG:', msprime.log_mutation_likelihood(arg, mutation_rate=mu)) print(" simplified:", msprime.log_mutation_likelihood(ts, mutation_rate=mu)) ``` @@ -290,17 +291,17 @@ its simplified version: ```{code-cell} large_sim_parameters = parameters.copy() large_sim_parameters["sequence_length"] *= 1000 -large_ts_arg = msprime.sim_ancestry( +large_arg = msprime.sim_ancestry( **large_sim_parameters, discrete_genome=False, # not technically needed, as we aren't calculating likelihoods coalescing_segments_only=False, additional_nodes=msprime.NodeType.COMMON_ANCESTOR | msprime.NodeType.RECOMBINANT, ) -large_ts = large_ts_arg.simplify() +large_ts = large_arg.simplify() print( "Non-coalescent nodes take up " - f"{(1-large_ts.num_nodes/large_ts_arg.num_nodes) * 100:0.2f}% " + f"{(1-large_ts.num_nodes/large_arg.num_nodes) * 100:0.2f}% " f"of this {large_ts.sequence_length/1e6:g} megabase {large_ts.num_samples}-tip ARG" ) ``` @@ -331,7 +332,7 @@ stored on *edges* (via the {attr}`~Edge.left` and {attr}`~Edge.right` propertie rather than on nodes. Here, for example, is the edge table from our ARG: ```{code-cell} -ts_arg.tables.edges +arg.tables.edges ``` Technically therefore, ARGs stored by `tskit` are edge-annotated @@ -425,6 +426,27 @@ which are never children of an edge are not visited by this algorithm. Such nodes are either {ref}`isolated` or a {ref}`root` in each local tree. +(sec_args_counting_unique_children)= + +### Counting unique children + +Because edges are ordered first by parent ID, then by child ID, it is efficient +to calculate the number of unique child IDs associated with a given parent. +Above we passed over the edges using a `groupby` operation, but the canonical +way to do this in the `numpy` library is to find where the group boundaries +are, and count those: + +```{code-cell} +def arg_num_children(ts): + # Return an array giving the number of unique child IDs for each node, anywhere in the genome. + # Relies on the fact that edges in a tree seq are sorted by parent ID then by child ID + same_parent = np.concatenate((ts.edges_parent[1:] == ts.edges_parent[:-1], [False])) + same_child = np.concatenate((ts.edges_child[1:] == ts.edges_child[:-1], [False])) + is_last_unique = ~same_parent | ~same_child # last occurrence of each unique child per parent + return np.bincount(ts.edges_parent[is_last_unique], minlength=ts.num_nodes) + +print(arg_num_children(arg)) +``` (sec_args_other_analysis)= @@ -458,26 +480,26 @@ def to_networkx_graph(ts, interval_lists=False): nx.set_node_attributes(G, {n.id: {'flags':n.flags, 'time': n.time} for n in ts.nodes()}) return G -arg = to_networkx_graph(ts_arg) +graph = to_networkx_graph(arg) ``` It is then possible to use the full range of networkx functions to analyse the graph: ```{code-cell} ipython3 -assert nx.is_directed_acyclic_graph(arg) # All ARGs should be DAGs -print("All descendants of node 10 are", nx.descendants(arg, 10)) +assert nx.is_directed_acyclic_graph(graph) # All ARGs should be DAGs +print("All descendants of node 10 are", nx.descendants(graph, 10)) ``` Networkx also has some built-in drawing functions: below is one of the simplest ones (for other possibilities, see the {ref}`sec_tskit_viz` tutorial). ```{code-cell} ipython3 -for layer, nodes in enumerate(nx.topological_generations(arg.reverse())): +for layer, nodes in enumerate(nx.topological_generations(graph.reverse())): for node in nodes: - arg.nodes[node]["layer"] = layer -pos = nx.multipartite_layout(arg, subset_key="layer", align='horizontal') + graph.nodes[node]["layer"] = layer +pos = nx.multipartite_layout(graph, subset_key="layer", align='horizontal') -nx.draw_networkx(arg, pos=pos) +nx.draw_networkx(graph, pos=pos) ``` ## Other software From 7c8d073a6b0ea3c03ab739fce7d7e1c2769d6b25 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Sun, 26 Apr 2026 12:22:22 +0100 Subject: [PATCH 2/3] Simpify the viz examples Now that we have inherited_state --- viz.md | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/viz.md b/viz.md index 3ff3fc69..9d28d961 100644 --- a/viz.md +++ b/viz.md @@ -337,19 +337,17 @@ See {ref}`sec_tskit_viz_dynamic_effects` if you want to dynamically hide and sho labels. ```{code-cell} ipython3 -nd_labels = {} # An array of labels for the nodes -for n in ts_mutated.nodes(): +nd_labels = { # An array of labels for the nodes # Set sample node labels from metadata. Here we use the population name, but you might want # to use the *individual* name instead, if the individuals in your tree sequence have names - if n.is_sample(): - nd_labels[n.id] = ts_mutated.population(n.population).metadata["name"] + n.id: ts_mutated.population(n.population).metadata["name"] + for n in ts_mutated.nodes() + if n.is_sample() +} -mut_labels = {} # An array of labels for the mutations -for mut in ts_mutated.mutations(): # Make pretty labels showing the change in state - site = ts_mutated.site(mut.site) - older_mut = mut.parent >= 0 # is there an older mutation at the same position? - prev = ts_mutated.mutation(mut.parent).derived_state if older_mut else site.ancestral_state - mut_labels[mut.id] = f"{prev}→{mut.derived_state}" +mut_labels = { # An array of labels for the mutations + mut.id: f"{mut.inherited_state}→{mut.derived_state}" for mut in ts_mutated.mutations() +} ts_mutated.draw_svg( size=(1000, 300), @@ -370,12 +368,8 @@ knowledge of the SVG graphics language (see the {ref}`sec_tskit_viz_legend_examp later in this tutorial for a node colour legend). ```{code-cell} ipython3 -mut_labels = { # Alternative way of defining a label dictionary using a dict comprehension - mut.id: ( - (ts_mutated.mutation(mut.parent).derived_state if mut.parent < 0 else site.ancestral_state) + - str(int(site.position)) + - mut.derived_state - ) +mut_labels = { + mut.id: f"{mut.inherited_state}{int(site.position)}{mut.derived_state}" for site in ts_mutated.sites() for mut in site.mutations } From edc20483ca73314cc847dc2ab22dae7b80fe36fa Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 20 Apr 2026 21:47:16 +0100 Subject: [PATCH 3/3] Make a simplification tutorial --- _toc.yml | 2 + advanced_simplification.md | 249 +++++++++++++++++++++++++++ data/simplification_basic.trees | Bin 0 -> 8428 bytes simplification.md | 295 +++++++++++++++++++++++++++++++- 4 files changed, 540 insertions(+), 6 deletions(-) create mode 100644 advanced_simplification.md create mode 100644 data/simplification_basic.trees diff --git a/_toc.yml b/_toc.yml index a747a1aa..4d945aaa 100644 --- a/_toc.yml +++ b/_toc.yml @@ -19,6 +19,8 @@ parts: chapters: - file: tables_and_editing - file: simplification + sections: + - file: advanced_simplification - file: viz - file: metadata - file: args diff --git a/advanced_simplification.md b/advanced_simplification.md new file mode 100644 index 00000000..2d3f2061 --- /dev/null +++ b/advanced_simplification.md @@ -0,0 +1,249 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_advanced_simplification)= + +# _Advanced simplification_ +% remove underscores in title when tutorial is complete or near-complete + + +This is a companion to the basic {ref}`sec_simplification` tutorial. +It focuses on details of `simplify` behavior that are useful when you need precise +control over node IDs, retained ancestry structure, and sample flags. + +If you are primarily subsetting samples or reducing tree sequence size, start with +{ref}`sec_simplification`. + + +```{code-cell} ipython3 +:"tags": ["hide-input"] +# Build a full ARG for demonstrations +import msprime +import numpy as np +import tskit +import tskit_arg_visualizer as argviz + +arg = msprime.sim_ancestry( + samples=10, + sequence_length=1e4, + recombination_rate=1e-8, + population_size=1e4, + record_full_arg=True, + random_seed=123, +) +arg = msprime.sim_mutations(arg, rate=1e-8, random_seed=124) +``` + +:::{note} +You can both simplify an immutable {class}`TreeSequence` (to return a new one), +or run {meth}`~TableCollection.simplify` on a {class}`TableCollection` (which +modifies in place, and requires {ref}`sec_table_indexes` to be built as well as the +tables to be {meth}`sorted `). Simplifying tables in place +is often useful for {ref}`forward-time simulations `. +::: + +## 1) Tracking node ID changes + +With default settings, simplification compacts tables and therefore reassigns node +IDs. If downstream code makes use of stored node IDs, request a map: + +```{code-cell} ipython3 +focal = arg.samples()[-6:] # pick last 6 samples (3 diploids) +simp, node_map = arg.simplify(focal, map_nodes=True) + +print(f"Original ARG: {arg.num_individuals} individuals, simplified to {simp.num_individuals} individuals") +print(f"Original ARG: {arg.num_nodes} nodes, simplified to {simp.num_nodes} nodes") +print("Dropped nodes:", int(np.sum(node_map == tskit.NULL))) +print("Old sample ID", int(focal[0]), "maps to new ID", int(node_map[focal[0]])) +``` + +Note that when simplifying tables in-place using {meth}`TableCollection.simplify` a map +is always returned. To avoid compacting the node table, and leave node IDs unchanged, use +`filter_nodes=False`. + +### Obtaining the reverse map + +Often you might want a reverse map, mapping the new node IDs to the old ones. Here's +a simple way to do this: + +```{code-cell} ipython3 +def invert_map(node_mapping): + kept = node_mapping != tskit.NULL + indexes = node_mapping[kept] # indexes are guaranteed 0..N-1 + rev_map = np.full_like(indexes, tskit.NULL) + rev_map[indexes] = np.flatnonzero(kept) + return rev_map + +reverse_map = invert_map(node_map) +print("New sample ID 0", "maps to old ID", int(reverse_map[0])) +``` + +## 2) Keeping input roots + +:::{todo} +This is easy to illustrate, and useful for forward sims / census approaches +::: + +## 3) Setting sample flags + +Normally the nodes that are provided to the `simplify()` function are marked as sample +nodes in the output (by setting the `NODE_IS_SAMPLE` flag), and other nodes have that flag unset. +If you provide the `update_sample_flags=False` option, all node flags are left unchanged. +Here are some cases where that can be useful. + +### Parallel simplification + +One use for the `update_sample_flags=False` option combines it with `filter_nodes=False`, +to ensure that the node table remains untouched during simplification. +This is primarily a use-case targetted at developers of forward simulators, and allows +logically disjunct parts of the edge table to be simplified in parallel, without +risking two parallel processes trying to alter the same data. + +:::{todo} +Should we provide an example? However, this tends to be done at a lower level, e.g. +using the C API, and is likely to be only useful for developers. +::: + +### Retaining non-coalescent nodes + +The `keep_unary=True` option retains *all* ancestral nodes after simplification, even +if they don't represent coalescences. Sometimes you might want to retain *some* but not +all such nodes, which can be done by adding them to the focal node list but using +`update_sample_flags=False` to ensure they are not marked as samples in the final output. +This is usually followed by an additional simplification pass to remove any topology +above the additional nodes which is not ancestral to the true samples. Here are some examples: + +#### Keeping non-coalescent regions of coalescent nodes + +By default, `simplify()` deletes not just non-coalescent nodes, but also +removes from the ancestry those regions of coalescent nodes which do not represent +a coalescence in the local tree (i.e. are "locally unary"). We can identify coalescent +nodes using an initial round of simplification: + +```{code-cell} +simp, node_map = arg.simplify(focal, map_nodes=True) +keep_nodes = np.where(node_map != tskit.NULL)[0] #includes sample and coalescent nodes + +# Retain lots of nodes by treating them as focal, but don't change sample flags +tmp_ts = arg.simplify(keep_nodes, update_sample_flags=False) +# Now re-simplify, in case any coalescent nodes have unwanted ancestry above +part_simp_ts = tmp_ts.simplify(keep_unary=True) +``` + +Often, leaving in the non-coalescent regions of coalescent nodes can lead to a reduction +in the number of edges. This is one reason that the {meth}`TreeSequence.extend_haplotypes` exists +(see the documentation of that method for more detail). + +```{code-cell} +print(f"Full simplification to samples {focal} leads to {simp.num_edges} edges") +print( + f"Similar simplification leaving partially-coalescent nodes leads to {part_simp_ts.num_edges} edges" +) +part_simp_ts.draw_svg(title=( + f"ARG simplified to samples {focal}, leaving part-coalescent nodes: " + f"this has {part_simp_ts.num_edges} edges" +)) +``` + + +#### Subsetting a full ARG + +The {ref}`sec_args` tutorial discusses the idea of a "full ARG", containing recombination and +non-coalescent common ancestor nodes, which do not correspond to coalescent events anywhere in +the genealogy. We can use `simplify()` to reduce a full ARG to a subset of samples, while retaining those non-coalescent nodes. Again, we need to figure out +which nodes to keep, then add them to the focal nodes, simplifying with `update_sample_flags=False`. + +We can detect the common ancestor nodes by finding those which still have 2 unique children after +simplifying with `keep_unary=True` and the recombination nodes +by finding which still come in pairs after such simplification. + +```{code-cell} ipython3 +def arg_num_children(ts): # see the ARG tutorial which explains this code + same_parent = np.concatenate((ts.edges_parent[1:] == ts.edges_parent[:-1], [False])) + same_child = np.concatenate((ts.edges_child[1:] == ts.edges_child[:-1], [False])) + is_last_unique = ~same_parent | ~same_child # last occurrence of each unique child per parent + return np.bincount(ts.edges_parent[is_last_unique], minlength=ts.num_nodes) + +re_node_pairs = np.where(arg.nodes_flags & msprime.NODE_IS_RE_EVENT)[0].reshape(-1, 2) +print("RE nodes before simplification\n", re_node_pairs) +``` + +Identifying the _msprime_ recombination nodes that stay as pairs after simplification requires +a little work: + +:::{todo} +Currently the code below doesn't quite work, because `keep_unary` forces the nodes above the local +roots to be kept, see https://github.com/tskit-dev/tskit/issues/3450. This means that some RE +(and possibly CA) nodes are kept when they should be discarded. +::: + +```{code-cell} ipython3 +simp, node_map = arg.simplify(focal, keep_unary=True, map_nodes=True) +reverse_map = invert_map(node_map) +simp_re_nodes = reverse_map[np.where(simp.nodes_flags & msprime.NODE_IS_RE_EVENT)[0]] +valid_pairs = np.isin(re_node_pairs, simp_re_nodes).all(axis=1) +keep_re_nodes = re_node_pairs[valid_pairs, :] +print("paired RE nodes after simplification\n", keep_re_nodes) + +keep_RE_nodes = keep_re_nodes.flatten() +keep_CA_nodes = reverse_map[arg_num_children(simp) > 1] +``` + +Now that we have defined which nodes to keep, we can use the same trick as before, +passing these nodes as focal, but simplifying twice, once with `update_sample_flags=False` +then again with `keep_unary=True`: + +```{code-cell} ipython3 +keep = np.concatenate((focal, keep_CA_nodes, keep_RE_nodes)) + +tmp_arg = arg.simplify(keep, update_sample_flags=False) +subset_arg = tmp_arg.simplify(keep_unary=True) # Defaults to focal nodes = existing samples +``` + +Here's what it looks like in graph form: + +```{code-cell} ipython3 +d3arg = argviz.D3ARG.from_ts(ts=subset_arg) +d3arg.draw(title=f"A full ARG, subset to {subset_arg.num_samples} samples"); +``` + +## 4) Keeping individuals + +In some cases, a tree sequence might contain historical individuals which are associated +with nodes that are not samples, and you wish to retain information on individuals which are +ancestral to the sample nodes. For example a forward-time simulation could +define individuals for all nodes in the past, including the pedigree links between parents +and children (see also discussion in the SLiM manual, and at +https://github.com/MesserLab/SLiM/issues/139). + +To keep all the individuals associated with genetic ancestry, you can use +`keep_unary_in_individuals=True`. + +:::{todo} +Should we have a demonstration here? {ref}`sec_tskit_forward_simulations` could be used to +create a simulator that saves pedigree information into each individual, and we could distill +some of the discussion from https://github.com/MesserLab/SLiM/issues/139 into that. +::: + +## 5) reduce_to_site_topology + +:::{todo} +Explain why you might use `reduce_to_site_topology`, e.g. if there is a lot of information +that is not inferable from sites. Note that this does not change the topology at each site, +although some of the topology at a site might not actually be needed to represent the site +variation (to see how to condense topology into "polytomies", see +https://github.com/tskit-dev/tskit/discussions/2926). +::: \ No newline at end of file diff --git a/data/simplification_basic.trees b/data/simplification_basic.trees new file mode 100644 index 0000000000000000000000000000000000000000..059fd822709a0650450e9c6d0e7f629b73b86053 GIT binary patch literal 8428 zcmeHMU#J{s72m2gF($QI|0Ncgj(I3@_s{IzyEk`(vX?YaTSG&dT1#oCb7#N1cbfUL zelvTU+v}BpMQW|a$Gi(t=!1bGErO6DV!#LcVEbxN#6SzxCo9@N=ggen&fLjvb{eo$ zaA7m^opXNY|Mz`oZ+>|1>iq}y9olzOtycRwecv$XpZD?kn8;rGRQB>oS@%Diz1$+} zS&vT$|J`;Xwi zEci>3|BsdnR$9aUq0eLtdDLF1SU=cb5a2=44}4Yo$4mGt744&bj|v|51@QCo2fQzM z^v_kXUlsq|lKsC|#G`+nDdFF$h)4b}2p;$SUTGwpXQi@FmWcmnrTFizh)4gtQ1bt2 zH1fpK^cVR5vfyQGr~!|9Oh?yhu>V>Kf4QQ4#Q%CJe&FZj5B2-E;4%O1{bJ_vJUslr zSq^T@ALj&r$E?7Gj{F@IJm!z*1V68TVZT$tUljZ==GX`RKEX>GYLCf5aOvaMR{;MV z!Q=cej0{)x{CHCE=%1G0JJ&Z~gZ=Y@$Naic(LU<`yx>vqvw}x{X1M|WuS))(6a2jT z!T&#${C~Ql|FHkM;BkIl5d18^L5KZ+mHY?3YW;4J#z+00t7sqj|BB$@Kk)PNhd3<3 zWB@84_wp$4%AsP&lYSijFnD^YDapY^e&!O zLA2@k6OOm(4aSZiH`je<7?bFP;76{*N~Y1o^&$%~PNQ8%T;eUFvZ#0YA|%lRQ+SGw zz#Hn+ zu_Dd3ufk!n!8wbYsi{QM-bzb4l?Z$aKzx}bR#Qc!h?(Rlq9<)k_R1cIUJ{ST-XNWy z_Rv|x=QN*Pe4gVsG+*6(zVeyK=OF)Wi_g;o^!;D^a7_H=cn}lfLTnr#$Hrg2M{t+a z`se$!tbGvABtGLMjYKOIrzY7at`C3suIBs~YsEEUK4L>m#aQ^AV`Lu3&a#YQEZ3Xs z&N^8)<^g*BuYOjKN< zr`q?uIeM`F&iD5p-Tx=m-}mB+=5K!dY=7INbKce7h|dxyFcM5^u zLGRn9GZ>@?k>j7#(TGP56`Q?8kJWknUnG2oQX&1#4cI z-k~!D-J;}-!Q-?-=P5%CW-AR z^b5X+1#wz@Ze;Q0&`*cAbK_8u=d0$~<_}6i$p5 z_l62LJspK?lt|lVl=Vow71M-86!R`eKjG2lFiiHEDGxbeTtCsScN(c3W@+UsCr-KT zPNSQ3YoQx6CC0r749n^*wt7pJaq_OyM!VHsI%?=B zN_uC*aow^y)^fw@v>M%&?^(@L$G>@e_5S1b>Zudmj(z%wWwqIrz@Z;EVd{#bgKj6= z0b9ui9~Ts$#~se$rv1qA96OnD30!b_v0=3v79A)=_PsUj==9tm4QCw6vuwfwfTSDQ zeoODv)L4Pdn>b?+V{XIkqOo@4h`B?b4}SMgW@^qIj{($B+1Qx+X!Gu*JrO3h&AFoW zSo6<4n|jnvlOsFYQuS7s^S^tWRohQ9505iSjmYlQmS1dJ|LJL)~^Y!}<4KVmS z9PQz5z169=Tc@p+p4I7DD~*+<_TplDI;XR;WZ??u{~PLH!B3itkFDkeo}GARn)$gB x_EX-SKk4&LBfq;-QrlTs?k+74>fN^6st?w?Yjt`: +information that stays associated with tskit objects even when their IDs change. +When simplifying, it is also possible to keep track of node ID changes by using +the `map_nodes` parameter, as demonstrated later in this tutorial. +::: + +## A larger simplification example + +Here we examine the impact of simplification on the efficiency of tree sequence +storage and processing. We'll start with a larger backward simulation that has +a handful of admixed individuals: + +```{code-cell} ipython3 +demography = msprime.Demography() +demography.add_population(name="SMALL", initial_size=1000) +demography.add_population(name="BIG", initial_size=4000) +demography.add_population(name="ADMIX", initial_size=500) +demography.add_population(name="ANC", initial_size=1500) +demography.add_admixture( + time=100, derived="ADMIX", ancestral=["SMALL", "BIG"], proportions=[0.5, 0.5] +) +demography.add_population_split(time=1_000, derived=["SMALL", "BIG"], ancestral="ANC") + +big_ts = msprime.sim_ancestry( + samples={"SMALL": 400, "BIG": 400, "ADMIX": 6, "ANC": 400}, + demography=demography, + sequence_length=5e7, + recombination_rate=2e-8, + random_seed=2432, +) +big_ts = msprime.sim_mutations(big_ts, rate=1e-8, random_seed=6151) +print( + f"`big_ts` represents a simulation with admixture of {big_ts.num_samples} samples", + f"over {big_ts.sequence_length/1e6:g} Mb ({big_ts.num_trees} trees)", +) +``` + +## Use case 1: remove historical samples + +Here, about a third of the sample nodes (those from the `ANC` population) exist +at times prior to the current generation, i.e. they are *historical* sample nodes. +In fact, in forward simulations most nodes will be of this sort, left over from +previously simulated generations. These are often unwanted, and one of the main +use cases for simplification is to reduce the ancestry to that of just the +contemporary genomes: i.e. removing any edges, nodes, and mutations that track +"extinct" lineages. + +```{code-cell} ipython3 +modern_sample_ids = big_ts.samples(time=0) +ts_modern = big_ts.simplify(modern_sample_ids) +print(f"Tree sequence simplified to {ts_modern.nbytes/big_ts.nbytes:.1%} of original size") +``` + +Simplifying has only produced a modest reduction in size, but you can imagine that +in a forward simulation, where the majority of genomes are historical, repeated +simplification can result in huge savings. In practice, simulators usually do this +regular simplification of the tables used to store the paths of genetic inheritance +automatically, so using +{meth}`~TableCollection.simplify` in this way is mainly of interest if you are +{ref}`building your own forward simulator `. + +## Use case 2: simplify to a subset of samples + +Often you might want to focus on only one group of samples, for example the small +group of `ADMIX` individuals (population ID 2 in this simulation): + +```{code-cell} ipython3 +admix_pop_id = 2 +assert big_ts.population(admix_pop_id).metadata["name"] == "ADMIX" +admix_sample_ids = big_ts.samples(population=admix_pop_id) + +ts_admix, node_map = big_ts.simplify(admix_sample_ids, map_nodes=True) +print(f"Tree sequence simplified to {ts_admix.nbytes/big_ts.nbytes:.2%} of original size") +print() +print(f"Previous admixed sample IDs were {admix_sample_ids}") +print(f"Simplifying has changed these to {node_map[admix_sample_ids]}") + +# Check that these are indeed the only sample IDs +assert set(node_map[admix_sample_ids]) == set(ts_admix.samples()) +``` -:::{todo} -Create content. See https://github.com/tskit-dev/tutorials/issues/52 +:::{note} +The `map_nodes=True` argument means that `simplify()` returns both a new +tree sequence and an array mapping each old node ID to its new ID, or to +`tskit.NULL` if that node is removed. +Here you can see that (unlike in previous examples) the sample node IDs +have changed: unless `filter_nodes=False`, the _N_ node IDs provided as the `samples` +argument will be allocated new IDs from 0 to _N_ - 1 in the returned tree sequence (so simplify can be used to reorder sample IDs, although +{meth}`~TreeSequence.subset` is a way to do this with fewer side effects). ::: + +### Efficiency + +Edges take up the majority of the space in most tree sequences. In this case you can +see that although simplify has reduced the sample nodes to 12 genomes from +the 6 diploid `ADMIX` individuals (a reduction of 99.5%), the number of edges +has not been reduced by such a large amount. +That's because many of the ancestors of the SMALL and BIG populations are also shared +by `ADMIX`. It also shows why tree sequence structures are so effective for encoding +and analysing large datasets: storage and processing efficiency, in particular the +number of edges, is sub-linear in the number of samples. + +```{code-cell} ipython3 +print( + f"The simplified tree sequence has only {ts_admix.num_samples / big_ts.num_samples:.2%} of the samples,", + f"but retains {ts_admix.num_edges / big_ts.num_edges:.2%} of the edges." +) +``` + +If you want to analyse only the admixed individuals, using the simplified tree sequence +is much more efficient than running equivalent operations on the original `big_ts`: + +```{code-cell} ipython3 +%%timeit +# Speed test for decoding all the genetic variants of the admixed individuals +for v in ts_admix.variants(): + pass +``` + +Identical results can be obtained using the full tree sequence and restricting calculations to the `admix_sample_ids`, but this approach is much slower: + +```{code-cell} ipython3 +%%timeit +# Equivalent processing of admixed individuals, using the full tree sequence +for v in big_ts.variants(samples=admix_sample_ids): + pass +``` + +The same efficiencies apply to calculating statistics on subsets of genomic samples. +As simplification has been highly optimised in `tskit`, if you perform repeated +processing of the same subset of genomes, it can be worth simplifying before +processing. + +### Removing other unused objects + +If we print out the original and admix-only (simplified) tree sequence, we can see +that a number of other tables have also been reduced in size. For instance, +simplification has reduced the number of individuals from 1206 to 6, and the +number of sites to less than a sixth of the original. + +```{code-cell} ipython3 +print("Original tree sequence") +big_ts +``` + +```{code-cell} ipython3 +print("Simplified tree sequence") +ts_admix +``` + +Note that the call to {meth}`TreeSequence.simplify` has been recorded in the +{ref}`sec_provenance` information. Like most tree sequence methods, you can pass +`record_provenance=False` if you want this to be omitted (which will save space, but not +lead to other efficiency gains). + +On closer inspection, you might be surprised to see that there are still 4 populations in +the simplified tree sequence, although it contains only samples from the `ADMIX` population: + +```{code-cell} ipython3 +print( + "Sample nodes in `ts_admix` belong to the following populations", + ts_admix.tables.nodes.population[ts_admix.samples()], +) +ts_admix.tables.populations +``` + +The reason that the other populations (`BIG`, `SMALL`, and `ANC`) have been retained is +that the simulation has assigned populations to both sample and nonsample nodes. If we +{ref}`edit ` the tree sequence tables such that ancestral +(non-sample) genomes are not associated with defined populations, then simplification will +remove all but the admixed population (and reassign the population IDs as +necessary). + +An example of this is given in the code below, which performs a further round of simplification, +taking advantage of the fact that if a list of focal nodes is not given, `simplify` +uses the existing sample nodes. + +```{code-cell} ipython3 +import numpy as np +import tskit + +tables = ts_admix.dump_tables() +samples = ts_admix.samples() +# Make an array of NULL population values for each node +nodes_population = np.full_like(tables.nodes.population, tskit.NULL) +# Set the sample node populations back to their expected population +nodes_population[samples] = ts_admix.nodes_population[samples] +tables.nodes.population = nodes_population +tables.simplify() # This is the tables version of simplify, often used in forward sims +ts_admix_only = tables.tree_sequence() + +print( + "Sample nodes in `ts_admix_only` belong to the following populations", + ts_admix_only.tables.nodes.population[ts_admix_only.samples()], +) +ts_admix_only.tables.populations +``` + +Although reducing the number of populations saves space, it requires care. +For instance `admix_pop_id` can no longer be used to refer to the correct ID +in the `ts_admix_only` tree sequence. + +## Extra uses for simplification + +Simplify is somewhat of a "Swiss Army knife" for tree sequences, and can be used in +several other ways. See the {ref}`sec_advanced_simplification` tutorial for more details.