<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Results on CuriousCoding</title><link>https://curiouscoding.nl/categories/results/</link><description>Recent content in Results on CuriousCoding</description><generator>Hugo</generator><language>en</language><lastBuildDate>Tue, 12 Aug 2025 00:00:00 +0200</lastBuildDate><atom:link href="https://curiouscoding.nl/categories/results/index.xml" rel="self" type="application/rss+xml"/><item><title>SimdSketch: a fast bucket sketch</title><link>https://curiouscoding.nl/posts/simd-sketch/</link><pubDate>Sun, 09 Mar 2025 00:00:00 +0100</pubDate><guid>https://curiouscoding.nl/posts/simd-sketch/</guid><description>&lt;div class="ox-hugo-toc toc has-section-numbers"&gt;
&lt;div class="heading"&gt;Table of Contents&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1&lt;/span&gt; &lt;a href="#jaccard-similarity" &gt;Jaccard similarity&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2&lt;/span&gt; &lt;a href="#hash-schemes" &gt;Hash schemes&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.1&lt;/span&gt; &lt;a href="#minhash" &gt;MinHash&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.2&lt;/span&gt; &lt;a href="#s-mins-sketch" &gt;\(s\)-mins sketch&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.3&lt;/span&gt; &lt;a href="#bottom-s" &gt;Bottom-\(s\) sketch&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.4&lt;/span&gt; &lt;a href="#fracminhash" &gt;FracMinHash&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.5&lt;/span&gt; &lt;a href="#bucket-sketch" &gt;Bucket sketch&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.6&lt;/span&gt; &lt;a href="#mod-bucket-hash--new" &gt;Mod-bucket hash (new?)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.7&lt;/span&gt; &lt;a href="#variants" &gt;Variants&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3&lt;/span&gt; &lt;a href="#compressing-sketches" &gt;Compressing sketches&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.1&lt;/span&gt; &lt;a href="#b-bit-hashing" &gt;\(b\)-bit hashing&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.1.1&lt;/span&gt; &lt;a href="#accounting-for-collisions" &gt;Accounting for collisions&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.2&lt;/span&gt; &lt;a href="#hyperminhash" &gt;HyperMinHash&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4&lt;/span&gt; &lt;a href="#densification-strategies" &gt;Densification strategies&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5&lt;/span&gt; &lt;a href="#simdsketch" &gt;SimdSketch&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6&lt;/span&gt; &lt;a href="#evaluation" &gt;Evaluation&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.1&lt;/span&gt; &lt;a href="#setup" &gt;Setup&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.1.1&lt;/span&gt; &lt;a href="#tools" &gt;Tools&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.1.2&lt;/span&gt; &lt;a href="#inputs" &gt;Inputs&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.1.3&lt;/span&gt; &lt;a href="#parameters" &gt;Parameters&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.1.4&lt;/span&gt; &lt;a href="#metrics" &gt;Metrics&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.2&lt;/span&gt; &lt;a href="#raw-results" &gt;Raw results&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.3&lt;/span&gt; &lt;a href="#correlation" &gt;Correlation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.4&lt;/span&gt; &lt;a href="#comparison-speed" &gt;Comparison speed&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6.5&lt;/span&gt; &lt;a href="#low-similarity-data" &gt;Low-similarity data&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7&lt;/span&gt; &lt;a href="#discussion" &gt;Discussion&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;8&lt;/span&gt; &lt;a href="#future-work" &gt;&lt;span class="org-todo todo TODO"&gt;TODO&lt;/span&gt; / Future work&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;!--endtoc--&gt;
&lt;p&gt;\[
\newcommand{\sketch}{\mathsf{sketch}}
\]&lt;/p&gt;</description></item><item><title>Static search trees: 40x faster than binary search</title><link>https://curiouscoding.nl/posts/static-search-tree/</link><pubDate>Wed, 18 Dec 2024 00:00:00 +0100</pubDate><guid>https://curiouscoding.nl/posts/static-search-tree/</guid><description>&lt;div class="ox-hugo-toc toc has-section-numbers"&gt;
&lt;div class="heading"&gt;Table of Contents&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1&lt;/span&gt; &lt;a href="#introduction" &gt;Introduction&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.1&lt;/span&gt; &lt;a href="#problem-statement" &gt;Problem statement&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.2&lt;/span&gt; &lt;a href="#motivation" &gt;Motivation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.3&lt;/span&gt; &lt;a href="#recommended-reading" &gt;Recommended reading&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.4&lt;/span&gt; &lt;a href="#binary-search-and-eytzinger-layout" &gt;Binary search and Eytzinger layout&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.5&lt;/span&gt; &lt;a href="#hugepages" &gt;Hugepages&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.6&lt;/span&gt; &lt;a href="#a-note-on-benchmarking" &gt;A note on benchmarking&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.7&lt;/span&gt; &lt;a href="#cache-lines" &gt;Cache lines&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.8&lt;/span&gt; &lt;a href="#s-trees-and-b-trees" &gt;S-trees and B-trees&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2&lt;/span&gt; &lt;a href="#optimizing-find" &gt;Optimizing &lt;code&gt;find&lt;/code&gt;&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.1&lt;/span&gt; &lt;a href="#linear" &gt;Linear&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.2&lt;/span&gt; &lt;a href="#auto-vectorization" &gt;Auto-vectorization&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.3&lt;/span&gt; &lt;a href="#trailing-zeros" &gt;Trailing zeros&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.4&lt;/span&gt; &lt;a href="#popcount" &gt;Popcount&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.5&lt;/span&gt; &lt;a href="#manual-simd" &gt;Manual SIMD&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3&lt;/span&gt; &lt;a href="#optimizing-the-search" &gt;Optimizing the search&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.1&lt;/span&gt; &lt;a href="#batching" &gt;Batching&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.2&lt;/span&gt; &lt;a href="#prefetching" &gt;Prefetching&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.3&lt;/span&gt; &lt;a href="#pointer-arithmetic" &gt;Pointer arithmetic&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.3.1&lt;/span&gt; &lt;a href="#up-front-splat" &gt;Up-front splat&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.3.2&lt;/span&gt; &lt;a href="#byte-based-pointers" &gt;Byte-based pointers&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.3.3&lt;/span&gt; &lt;a href="#the-final-version" &gt;The final version&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.4&lt;/span&gt; &lt;a href="#skip-prefetch" &gt;Skip prefetch&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.5&lt;/span&gt; &lt;a href="#interleave" &gt;Interleave&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4&lt;/span&gt; &lt;a href="#optimizing-the-tree-layout" &gt;Optimizing the tree layout&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.1&lt;/span&gt; &lt;a href="#left-tree" &gt;Left-tree&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.2&lt;/span&gt; &lt;a href="#memory-layouts" &gt;Memory layouts&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.3&lt;/span&gt; &lt;a href="#node-size-b-15" &gt;Node size \(B=15\)&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.3.1&lt;/span&gt; &lt;a href="#data-structure-size" &gt;Data structure size&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.4&lt;/span&gt; &lt;a href="#summary" &gt;Summary&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5&lt;/span&gt; &lt;a href="#prefix-partitioning" &gt;Prefix partitioning&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.1&lt;/span&gt; &lt;a href="#full-layout" &gt;Full layout&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.2&lt;/span&gt; &lt;a href="#compact-subtrees" &gt;Compact subtrees&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.3&lt;/span&gt; &lt;a href="#the-best-of-both-compact-first-level" &gt;The best of both: compact first level&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.4&lt;/span&gt; &lt;a href="#overlapping-trees" &gt;Overlapping trees&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.5&lt;/span&gt; &lt;a href="#human-data" &gt;Human data&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.6&lt;/span&gt; &lt;a href="#prefix-map" &gt;Prefix map&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.7&lt;/span&gt; &lt;a href="#prefix-summary" &gt;Summary&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6&lt;/span&gt; &lt;a href="#multi-threaded-comparison" &gt;Multi-threaded comparison&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7&lt;/span&gt; &lt;a href="#conclusion" &gt;Conclusion&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1&lt;/span&gt; &lt;a href="#future-work" &gt;Future work&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.1&lt;/span&gt; &lt;a href="#branchy-search" &gt;Branchy search&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.2&lt;/span&gt; &lt;a href="#interpolation-search" &gt;Interpolation search&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.3&lt;/span&gt; &lt;a href="#packing-data-smaller" &gt;Packing data smaller&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.4&lt;/span&gt; &lt;a href="#returning-indices-in-original-data" &gt;Returning indices in original data&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.5&lt;/span&gt; &lt;a href="#range-queries" &gt;Range queries&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.6&lt;/span&gt; &lt;a href="#sorting-queries" &gt;Sorting queries&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.7&lt;/span&gt; &lt;a href="#suffix-array-searching" &gt;Suffix array searching&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;!--endtoc--&gt;
&lt;p&gt;In this post, we will implement a static search tree (S+ tree) for
high-throughput searching of sorted data, as &lt;a href="https://en.algorithmica.org/hpc/data-structures/s-tree/" class="external-link" target="_blank" rel="noopener"&gt;introduced&lt;/a&gt; on Algorithmica.
We&amp;rsquo;ll mostly take the code presented there as a starting point, and optimize it
to its limits. For a large part, I&amp;rsquo;m simply taking the &amp;lsquo;future work&amp;rsquo; ideas of that post
and implementing them. And then there will be a bunch of looking at assembly
code to shave off all the instructions we can.
Lastly, there will be one big addition to optimize throughput: &lt;em&gt;batching&lt;/em&gt;.&lt;/p&gt;</description></item><item><title>A lemma on suffix array searching</title><link>https://curiouscoding.nl/posts/suffix-array-searching-lemma/</link><pubDate>Sat, 05 Oct 2024 00:00:00 +0200</pubDate><guid>https://curiouscoding.nl/posts/suffix-array-searching-lemma/</guid><description>&lt;div class="ox-hugo-toc toc has-section-numbers"&gt;
&lt;div class="heading"&gt;Table of Contents&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1&lt;/span&gt; &lt;a href="#suffix-arrays" &gt;Suffix arrays&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2&lt;/span&gt; &lt;a href="#searching-methods" &gt;Searching methods&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.1&lt;/span&gt; &lt;a href="#naive-o--p-cdot-lg-2-n--search" &gt;Naive \(O(|P|\cdot \lg_2 n)\) search&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.2&lt;/span&gt; &lt;a href="#faster-search" &gt;Faster \(O(|P|\cdot \lg_2 n)\) search&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.3&lt;/span&gt; &lt;a href="#lcp-based-o--p-plus-lg-2-n--search" &gt;LCP-based \(O(|P| + \lg_2 n)\) search&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3&lt;/span&gt; &lt;a href="#analysing-the-faster-search" &gt;Analysing the faster search&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;!--endtoc--&gt;
&lt;p&gt;We&amp;rsquo;ll prove that the &amp;ldquo;faster&amp;rdquo; binary search algorithm (see &lt;a href="#faster-search" &gt;2.2&lt;/a&gt;), which tracks the LCP
with the left and right boundaries of the remaining search interval, has amortized
runtime&lt;/p&gt;
&lt;p&gt;\[
O\Big(\lg_2(n) + |P| + |P| \cdot \lg_2(Occ(P))\Big),
\]
when \(P\) is a randomly sampled fixed-length pattern from the text and \(Occ(P)\) counts the number of occurrences of \(P\) in the text.&lt;/p&gt;</description></item><item><title>Perfect NtHash for Robust Minimizers</title><link>https://curiouscoding.nl/posts/nthash/</link><pubDate>Sun, 31 Dec 2023 00:00:00 +0100</pubDate><guid>https://curiouscoding.nl/posts/nthash/</guid><description>&lt;div class="ox-hugo-toc toc"&gt;
&lt;div class="heading"&gt;Table of Contents&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#nthash" &gt;NtHash&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#minimizers" &gt;Minimizers&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#robust-minimizers" &gt;Robust minimizers&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="#is-nthash-injective-on-kmers" &gt;Is NtHash injective on kmers?&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#searching-for-a-collision" &gt;Searching for a collision&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#proving-perfection" &gt;Proving perfection&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="#alternatives" &gt;Alternatives&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#smhasher-results" &gt;SmHasher results&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;!--endtoc--&gt;
&lt;h2 id="nthash"&gt;
 NtHash
 &lt;a class="heading-link" href="#nthash"&gt;
 &lt;i class="fa-solid fa-link" aria-hidden="true" title="Link to heading"&gt;&lt;/i&gt;
 &lt;span class="sr-only"&gt;Link to heading&lt;/span&gt;
 &lt;/a&gt;
&lt;/h2&gt;
&lt;p&gt;NtHash (&lt;a href="#citeproc_bib_item_3"&gt;Mohamadi et al. 2016&lt;/a&gt;) is a rolling hash suitable for hashing any kind of text, but made for DNA originally.
For a string of length \(k\) it is a \(64\) bit value computed as:&lt;/p&gt;</description></item><item><title>Tensor embedding preserves Hamming distance</title><link>https://curiouscoding.nl/posts/tensor-embedding-distance/</link><pubDate>Fri, 14 Oct 2022 00:00:00 +0200</pubDate><guid>https://curiouscoding.nl/posts/tensor-embedding-distance/</guid><description>&lt;div class="ox-hugo-toc toc"&gt;
&lt;div class="heading"&gt;Table of Contents&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#definitions" &gt;Definitions&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#proof-of-lemma-1" &gt;Proof of Lemma 1&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#proof-of-lemma-2" &gt;&lt;span class="org-todo todo TODO"&gt;TODO&lt;/span&gt; Proof of Lemma 2&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;!--endtoc--&gt;
&lt;p&gt;This is a proof that Tensor Embedding
(&lt;a href="#citeproc_bib_item_3"&gt;Joudaki, Rätsch, and Kahles 2020&lt;/a&gt;) with \(\ell^2\)-norm preserves the Hamming distance.&lt;/p&gt;
&lt;p&gt;This is in collaboration with Amir Joudaki.&lt;/p&gt;
&lt;p&gt;\begin{equation*}
\newcommand{\I}{\mathcal I}
\newcommand{\EE}{\mathbb E}
\newcommand{\var}{\operatorname{Var}}
\end{equation*}&lt;/p&gt;
&lt;h2 id="definitions"&gt;
 Definitions
 &lt;a class="heading-link" href="#definitions"&gt;
 &lt;i class="fa-solid fa-link" aria-hidden="true" title="Link to heading"&gt;&lt;/i&gt;
 &lt;span class="sr-only"&gt;Link to heading&lt;/span&gt;
 &lt;/a&gt;
&lt;/h2&gt;
&lt;dl&gt;
&lt;dt&gt;Notation&lt;/dt&gt;
&lt;dd&gt;&lt;ul&gt;
&lt;li&gt;The alphabet is \(\Sigma\), of size \(|\Sigma| = \sigma\).&lt;/li&gt;
&lt;li&gt;The set of indices is \(\I := \{(i_1, \dots, i_t) \in [n]^t: i_1 &amp;lt; \dots &amp;lt; i_t\}\).&lt;/li&gt;
&lt;li&gt;Given a string \(a_1\dots a_n = a\in \Sigma^n\), we define the &lt;em&gt;\(I\)-index&lt;/em&gt; as
\(a_I = (a_{i_1}, \dots, a_{i_t})\).&lt;/li&gt;
&lt;li&gt;We write \([ X ]\) for the indicator variable of event \(X\), which is \(1\) when
\(X\) holds and \(0\) otherwise.&lt;/li&gt;
&lt;/ul&gt;
&lt;/dd&gt;
&lt;dt&gt;Definition 1: Tensor embedding&lt;/dt&gt;
&lt;dd&gt;Given \(a\in \Sigma^n\), the &lt;em&gt;tensor embedding&lt;/em&gt; \(T_a\) is the \(\sigma^t\) tensor
given by \(T_a[s] = \sum_{I\in \I} [a_I = s]\) for each \(s\in \Sigma^t\).
&lt;p&gt;The &lt;em&gt;normalized tensor embedding distance&lt;/em&gt; \(d_{te}\) between two sequences \(a\)
and \(b\) is defined as&lt;/p&gt;</description></item><item><title>28000x speedup with Numba.CUDA</title><link>https://curiouscoding.nl/posts/numba-cuda-speedup/</link><pubDate>Mon, 24 May 2021 00:00:00 +0200</pubDate><guid>https://curiouscoding.nl/posts/numba-cuda-speedup/</guid><description>&lt;div class="ox-hugo-toc toc"&gt;
&lt;div class="heading"&gt;Table of Contents&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#cuda-overview" &gt;CUDA Overview&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#profiling" &gt;Profiling&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="#optimizing-tensor-sketch" &gt;Optimizing Tensor Sketch&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#cpu-code" &gt;CPU code&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#v0-original-python-code" &gt;V0: Original python code&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v1-numba" &gt;V1: Numba&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v2-multithreading" &gt;V2: Multithreading&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="#gpu-code" &gt;GPU code&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#v3-a-first-gpu-version" &gt;V3: A first GPU version&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v4-parallel-kernel-invocations" &gt;V4: Parallel kernel invocations&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v5-single-kernel-with-many-blocks" &gt;V5: Single kernel with many blocks&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v6-detailed-profiling-kernel-compute" &gt;V6: Detailed profiling: Kernel Compute&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v7-detailed-profiling-kernel-latency" &gt;V7: Detailed profiling: Kernel Latency&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v8-detailed-profiling-shared-memory-access-pattern" &gt;V8: Detailed profiling: Shared Memory Access Pattern&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v9-more-work-per-thread" &gt;V9: More work per thread&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v10-cache-seq-to-shared-memory" &gt;V10: Cache seq to shared memory&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v11-hashes-and-signs-in-shared-memory" &gt;V11: Hashes and signs in shared memory&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v12-revisiting-blocks-per-kernel" &gt;V12: Revisiting blocks per kernel&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v13-passing-a-tuple-of-sequences" &gt;V13: Passing a tuple of sequences&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v14-better-hardware" &gt;V14: Better hardware&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#v15-dynamic-shared-memory" &gt;V15: Dynamic shared memory&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="#wrap-up" &gt;Wrap up&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;!--endtoc--&gt;
&lt;p&gt;&lt;strong&gt;Backlinks&lt;/strong&gt;: &lt;a href="https://www.reddit.com/r/CUDA/comments/mq1yrm/28000x_speedup_with_numbacuda/" class="external-link" target="_blank" rel="noopener"&gt;r/CUDA&lt;/a&gt;, &lt;a href="https://numba.discourse.group/t/blog-28000x-speedup-with-numba-cuda/667" class="external-link" target="_blank" rel="noopener"&gt;Numba discourse&lt;/a&gt;&lt;/p&gt;</description></item></channel></rss>