<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Popular on CuriousCoding</title><link>https://curiouscoding.nl/tags/popular/</link><description>Recent content in Popular on CuriousCoding</description><generator>Hugo</generator><language>en</language><lastBuildDate>Wed, 18 Dec 2024 00:00:00 +0100</lastBuildDate><atom:link href="https://curiouscoding.nl/tags/popular/index.xml" rel="self" type="application/rss+xml"/><item><title>Static search trees: 40x faster than binary search</title><link>https://curiouscoding.nl/posts/static-search-tree/</link><pubDate>Wed, 18 Dec 2024 00:00:00 +0100</pubDate><guid>https://curiouscoding.nl/posts/static-search-tree/</guid><description>&lt;div class="ox-hugo-toc toc has-section-numbers"&gt;
&lt;div class="heading"&gt;Table of Contents&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1&lt;/span&gt; &lt;a href="#introduction" &gt;Introduction&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.1&lt;/span&gt; &lt;a href="#problem-statement" &gt;Problem statement&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.2&lt;/span&gt; &lt;a href="#motivation" &gt;Motivation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.3&lt;/span&gt; &lt;a href="#recommended-reading" &gt;Recommended reading&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.4&lt;/span&gt; &lt;a href="#binary-search-and-eytzinger-layout" &gt;Binary search and Eytzinger layout&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.5&lt;/span&gt; &lt;a href="#hugepages" &gt;Hugepages&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.6&lt;/span&gt; &lt;a href="#a-note-on-benchmarking" &gt;A note on benchmarking&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.7&lt;/span&gt; &lt;a href="#cache-lines" &gt;Cache lines&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;1.8&lt;/span&gt; &lt;a href="#s-trees-and-b-trees" &gt;S-trees and B-trees&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2&lt;/span&gt; &lt;a href="#optimizing-find" &gt;Optimizing &lt;code&gt;find&lt;/code&gt;&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.1&lt;/span&gt; &lt;a href="#linear" &gt;Linear&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.2&lt;/span&gt; &lt;a href="#auto-vectorization" &gt;Auto-vectorization&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.3&lt;/span&gt; &lt;a href="#trailing-zeros" &gt;Trailing zeros&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.4&lt;/span&gt; &lt;a href="#popcount" &gt;Popcount&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;2.5&lt;/span&gt; &lt;a href="#manual-simd" &gt;Manual SIMD&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3&lt;/span&gt; &lt;a href="#optimizing-the-search" &gt;Optimizing the search&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.1&lt;/span&gt; &lt;a href="#batching" &gt;Batching&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.2&lt;/span&gt; &lt;a href="#prefetching" &gt;Prefetching&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.3&lt;/span&gt; &lt;a href="#pointer-arithmetic" &gt;Pointer arithmetic&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.3.1&lt;/span&gt; &lt;a href="#up-front-splat" &gt;Up-front splat&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.3.2&lt;/span&gt; &lt;a href="#byte-based-pointers" &gt;Byte-based pointers&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.3.3&lt;/span&gt; &lt;a href="#the-final-version" &gt;The final version&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.4&lt;/span&gt; &lt;a href="#skip-prefetch" &gt;Skip prefetch&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;3.5&lt;/span&gt; &lt;a href="#interleave" &gt;Interleave&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4&lt;/span&gt; &lt;a href="#optimizing-the-tree-layout" &gt;Optimizing the tree layout&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.1&lt;/span&gt; &lt;a href="#left-tree" &gt;Left-tree&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.2&lt;/span&gt; &lt;a href="#memory-layouts" &gt;Memory layouts&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.3&lt;/span&gt; &lt;a href="#node-size-b-15" &gt;Node size \(B=15\)&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.3.1&lt;/span&gt; &lt;a href="#data-structure-size" &gt;Data structure size&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;4.4&lt;/span&gt; &lt;a href="#summary" &gt;Summary&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5&lt;/span&gt; &lt;a href="#prefix-partitioning" &gt;Prefix partitioning&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.1&lt;/span&gt; &lt;a href="#full-layout" &gt;Full layout&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.2&lt;/span&gt; &lt;a href="#compact-subtrees" &gt;Compact subtrees&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.3&lt;/span&gt; &lt;a href="#the-best-of-both-compact-first-level" &gt;The best of both: compact first level&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.4&lt;/span&gt; &lt;a href="#overlapping-trees" &gt;Overlapping trees&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.5&lt;/span&gt; &lt;a href="#human-data" &gt;Human data&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.6&lt;/span&gt; &lt;a href="#prefix-map" &gt;Prefix map&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;5.7&lt;/span&gt; &lt;a href="#prefix-summary" &gt;Summary&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;6&lt;/span&gt; &lt;a href="#multi-threaded-comparison" &gt;Multi-threaded comparison&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7&lt;/span&gt; &lt;a href="#conclusion" &gt;Conclusion&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1&lt;/span&gt; &lt;a href="#future-work" &gt;Future work&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.1&lt;/span&gt; &lt;a href="#branchy-search" &gt;Branchy search&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.2&lt;/span&gt; &lt;a href="#interpolation-search" &gt;Interpolation search&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.3&lt;/span&gt; &lt;a href="#packing-data-smaller" &gt;Packing data smaller&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.4&lt;/span&gt; &lt;a href="#returning-indices-in-original-data" &gt;Returning indices in original data&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.5&lt;/span&gt; &lt;a href="#range-queries" &gt;Range queries&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.6&lt;/span&gt; &lt;a href="#sorting-queries" &gt;Sorting queries&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;span class="section-num"&gt;7.1.7&lt;/span&gt; &lt;a href="#suffix-array-searching" &gt;Suffix array searching&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;!--endtoc--&gt;
&lt;p&gt;In this post, we will implement a static search tree (S+ tree) for
high-throughput searching of sorted data, as &lt;a href="https://en.algorithmica.org/hpc/data-structures/s-tree/" class="external-link" target="_blank" rel="noopener"&gt;introduced&lt;/a&gt; on Algorithmica.
We&amp;rsquo;ll mostly take the code presented there as a starting point, and optimize it
to its limits. For a large part, I&amp;rsquo;m simply taking the &amp;lsquo;future work&amp;rsquo; ideas of that post
and implementing them. And then there will be a bunch of looking at assembly
code to shave off all the instructions we can.
Lastly, there will be one big addition to optimize throughput: &lt;em&gt;batching&lt;/em&gt;.&lt;/p&gt;</description></item><item><title>One Billion Row Challenge</title><link>https://curiouscoding.nl/posts/1brc/</link><pubDate>Wed, 03 Jan 2024 00:00:00 +0100</pubDate><guid>https://curiouscoding.nl/posts/1brc/</guid><description>&lt;div class="ox-hugo-toc toc"&gt;
&lt;div class="heading"&gt;Table of Contents&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#external-links" &gt;External links&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#the-problem" &gt;The problem&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#initial-solution-105s" &gt;Initial solution: 105s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#first-flamegraph" &gt;First flamegraph&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#bytes-instead-of-strings-72s" &gt;Bytes instead of strings: 72s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#manual-parsing-61s" &gt;Manual parsing: 61s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#inline-hash-keys-50s" &gt;Inline hash keys: 50s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#faster-hash-function-41s" &gt;Faster hash function: 41s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#a-new-flame-graph" &gt;A new flame graph&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#perf-it-is" &gt;Perf it is&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#something-simple-allocating-the-right-size-41s" &gt;Something simple: allocating the right size: 41s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#memchr-for-scanning-47s" &gt;&lt;code&gt;memchr&lt;/code&gt; for scanning: 47s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#memchr-crate-29s" &gt;&lt;code&gt;memchr&lt;/code&gt; crate: 29s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#get-unchecked-28s" &gt;&lt;code&gt;get_unchecked&lt;/code&gt;: 28s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#manual-simd-29s" &gt;Manual SIMD: 29s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#profiling" &gt;Profiling&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#revisiting-the-key-function-23s" &gt;Revisiting the key function: 23s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#ptrhash-perfect-hash-function-17s" &gt;PtrHash perfect hash function: 17s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#larger-masks-15s" &gt;Larger masks: 15s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#reduce-pattern-matching-14s" &gt;Reduce pattern matching: 14s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#memory-map-12s" &gt;Memory map: 12s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#parallelization-2-dot-0s" &gt;Parallelization: 2.0s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#branchless-parsing-1-dot-7s" &gt;Branchless parsing: 1.7s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#purging-all-branches-1-dot-67s" &gt;Purging all branches: 1.67s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#some-more-attempts" &gt;Some more attempts&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#faster-perfect-hashing-1-dot-55s" &gt;Faster perfect hashing: 1.55s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#bug-time-back-up-to-1-dot-71s" &gt;Bug time: Back up to 1.71s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#temperatures-less-than-100-1-dot-62s" &gt;Temperatures less than 100: 1.62s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#computing-min-as-a-max-1-dot-50" &gt;Computing &lt;code&gt;min&lt;/code&gt; as a &lt;code&gt;max&lt;/code&gt;: 1.50&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#intermezzo-hyperthreading-1-dot-34s" &gt;Intermezzo: Hyperthreading: 1.34s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#not-parsing-negative-numbers-1-dot-48s" &gt;Not parsing negative numbers: 1.48s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#more-efficient-parsing-1-dot-44s" &gt;More efficient parsing: 1.44s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#fixing-undefined-behaviour-back-to-1-dot-56s" &gt;Fixing undefined behaviour: back to 1.56s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#lazily-subtracting-b-0-1-dot-52s" &gt;Lazily subtracting &lt;code&gt;b'0'&lt;/code&gt;: 1.52s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#min-max-without-parsing-1-dot-55s" &gt;Min/max without parsing: 1.55s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#parsing-using-a-single-multiplication-doesn-t-work" &gt;Parsing using a single multiplication: doesn&amp;rsquo;t work&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#parsing-using-a-single-multiplication-does-work-after-all-1-dot-48s" &gt;Parsing using a single multiplication does work after all! 1.48s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#a-side-note-ascii" &gt;A side note: ASCII&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#skip-parsing-using-pdep-1-dot-42s" &gt;Skip parsing using &lt;code&gt;PDEP&lt;/code&gt;: 1.42s&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a href="#improved" &gt;Improved&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#a-further-note" &gt;A further note&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a href="#branchy-min-max-1-dot-37s" &gt;Branchy min/max: 1.37s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#no-counting-1-dot-34s" &gt;No counting: 1.34s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#arbitrary-long-city-names-1-dot-34" &gt;Arbitrarily long city names: 1.34&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#4-entries-in-parallel-1-dot-23s" &gt;4 entries in parallel: 1.23s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#mmap-per-thread" &gt;Mmap per thread&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#reordering-some-operations-1-dot-19s" &gt;Reordering some operations: 1.19s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#reordering-more-1-dot-11s" &gt;Reordering more: 1.11s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#even-more-ilp-1-dot-05" &gt;Even more ILP: 1.05&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#compliance-1-ok-i-ll-count-1-dot-06" &gt;Compliance 1, OK I&amp;rsquo;ll count: 1.06&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#d41d8c" &gt;&lt;span class="org-todo todo TODO"&gt;TODO&lt;/span&gt; &lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="#postscript" &gt;Postscript&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/div&gt;
&lt;!--endtoc--&gt;
&lt;p&gt;A YouTube video about this post is &lt;a href="https://youtu.be/e_9ziFKcEhw?si=JHy4aVliKw9gfryf&amp;amp;t=896" class="external-link" target="_blank" rel="noopener"&gt;here&lt;/a&gt;.&lt;/p&gt;</description></item></channel></rss>