<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>Will.Whim</title>
	<atom:link href="http://willwhim.wordpress.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://willwhim.wordpress.com</link>
	<description>A weblog by Will Fitzgerald</description>
	<lastBuildDate>Sun, 15 Jan 2012 00:51:08 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='willwhim.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>Will.Whim</title>
		<link>http://willwhim.wordpress.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://willwhim.wordpress.com/osd.xml" title="Will.Whim" />
	<atom:link rel='hub' href='http://willwhim.wordpress.com/?pushpress=hub'/>
		<item>
		<title>The Non-Chaos, or English Spelling Defended in Rhyme</title>
		<link>http://willwhim.wordpress.com/2012/01/13/the-non-chaos-or-english-spelling-defended-in-rhyme/</link>
		<comments>http://willwhim.wordpress.com/2012/01/13/the-non-chaos-or-english-spelling-defended-in-rhyme/#comments</comments>
		<pubDate>Fri, 13 Jan 2012 05:01:41 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Language]]></category>
		<category><![CDATA[Whim]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1159</guid>
		<description><![CDATA[Dearest creature in creation, Study English pronunciation. It’s more regular in its core Than pundits, who focus on its more Erratic ways, would have you believe. Perhaps they simply cannot conceive Of any system not based in Latin— They would choose, I suppose, to flatten All writing to “one form, one sound” But, really, regularities [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1159&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Dearest creature in creation,<br />
Study English pronunciation.<br />
It’s more regular in its core<br />
Than pundits, who focus on its more<br />
Erratic ways, would have you believe.<br />
Perhaps they simply cannot conceive<br />
Of any system not based in Latin—<br />
They would choose, I suppose, to flatten<br />
All writing to “one form, one sound”<br />
But, really, regularities abound.<br />
Consider, how we pronounce the plural<br />
Form of words; Imagine the neural<br />
Work of reading “dogs” and “cats.”<br />
Would you prefer “dogz”? That’s<br />
Not right—that single ess for each<br />
Is easier to read, to sound out, and to teach.<br />
Or consider “heir/inherit”<br />
To write “air” would be a demerit,<br />
A signature failure, and a sign<br />
Of a spelling system’s worse design.<br />
Seriously, it would simply astonish,<br />
Anyone to think that “ghoti” sounds like “fish.”<br />
Besides, English spans such colossal ages<br />
And latitudes, I doubt such cages<br />
Desired by fans of regularization<br />
Could withstand the normal mutation<br />
Of how language really adapts.<br />
“Wind” and “hind” have rhymed or not, perhaps,<br />
As, over time and place, each has adopted<br />
A short I, sometimes a long I, co-opted<br />
By real human beings. So “after tea and cakes and ices,”<br />
Let us “force the moment to its crisis&#8221;—<br />
Haters, they say, are going to hate; let them snivel<br />
I have had enough of drivel,<br />
Go ahead, enjoy your whine,<br />
But English spelling is basically fine.</p>
<p>—Will Fitzgerald, January 2012</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1159/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1159/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1159/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1159/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1159/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1159/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1159/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1159/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1159/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1159/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1159/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1159/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1159/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1159/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1159&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2012/01/13/the-non-chaos-or-english-spelling-defended-in-rhyme/feed/</wfw:commentRss>
		<slash:comments>4</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
		<item>
		<title>Distribution of tweet lengths</title>
		<link>http://willwhim.wordpress.com/2012/01/07/distribution-of-tweet-lengths/</link>
		<comments>http://willwhim.wordpress.com/2012/01/07/distribution-of-tweet-lengths/#comments</comments>
		<pubDate>Sat, 07 Jan 2012 23:47:38 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Language]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1152</guid>
		<description><![CDATA[I get a very different distribution of tweets than Isaac Hepworth — no spikes at 28. My provisional guess is that his data is a bit wonky. My data here is (only) 50k English tweets from one day in 2007.<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1152&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<div id="attachment_1153" class="wp-caption alignnone" style="width: 310px"><a href="http://willwhim.files.wordpress.com/2012/01/tweet_length.png"><img class="size-medium wp-image-1153" title="Tweet lengths" src="http://willwhim.files.wordpress.com/2012/01/tweet_length.png?w=300&#038;h=75" alt="" width="300" height="75" /></a><p class="wp-caption-text">% of English tweets by size (sample 50k)</p></div>
<p>I get a very different distribution of tweets than <a href="https://twitter.com/#!/isaach/status/155437871149481984/photo/1">Isaac Hepworth</a> — no spikes at 28. My provisional guess is that his data is a bit wonky. My data here is (only) 50k English tweets from one day in 2007.</p>
<div id="attachment_1154" class="wp-caption alignnone" style="width: 310px"><a href="http://willwhim.files.wordpress.com/2012/01/aig565bcaaaygkb.png"><img class="size-medium wp-image-1154" title="Isaac " src="http://willwhim.files.wordpress.com/2012/01/aig565bcaaaygkb.png?w=300&#038;h=208" alt="" width="300" height="208" /></a><p class="wp-caption-text">Isaac Hepworth&#039;s distribution</p></div>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1152/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1152/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1152/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1152/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1152/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1152/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1152/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1152/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1152/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1152/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1152/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1152/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1152/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1152/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1152&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2012/01/07/distribution-of-tweet-lengths/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>

		<media:content url="http://willwhim.files.wordpress.com/2012/01/tweet_length.png?w=300" medium="image">
			<media:title type="html">Tweet lengths</media:title>
		</media:content>

		<media:content url="http://willwhim.files.wordpress.com/2012/01/aig565bcaaaygkb.png?w=300" medium="image">
			<media:title type="html">Isaac </media:title>
		</media:content>
	</item>
		<item>
		<title>2011 in review</title>
		<link>http://willwhim.wordpress.com/2012/01/01/2011-in-review/</link>
		<comments>http://willwhim.wordpress.com/2012/01/01/2011-in-review/#comments</comments>
		<pubDate>Sun, 01 Jan 2012 00:09:53 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1150</guid>
		<description><![CDATA[The WordPress.com stats helper monkeys prepared a 2011 annual report for this blog. Here&#8217;s an excerpt: A New York City subway train holds 1,200 people. This blog was viewed about 6,000 times in 2011. If it were a NYC subway train, it would take about 5 trips to carry that many people. Click here to [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1150&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>The WordPress.com stats helper monkeys prepared a 2011 annual report for this blog.</p>
<div style="background:url('/wp-content/mu-plugins/annual-reports/img/emailteaser.jpg') no-repeat center center;height:300px;"></div>
<p>Here&#8217;s an excerpt:</p>
<blockquote><p>A New York City subway train holds 1,200 people. This blog was viewed about <strong>6,000</strong> times in 2011. If it were a NYC subway train, it would take about 5 trips to carry that many people.</p></blockquote>
<p><a href="/2011/annual-report/">Click here to see the complete report.</a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1150/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1150/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1150/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1150&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2012/01/01/2011-in-review/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
		<item>
		<title>Useful Scala introduction</title>
		<link>http://willwhim.wordpress.com/2011/12/23/useful-scala-introduction/</link>
		<comments>http://willwhim.wordpress.com/2011/12/23/useful-scala-introduction/#comments</comments>
		<pubDate>Fri, 23 Dec 2011 20:41:49 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Science and Tech]]></category>
		<category><![CDATA[Scala]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1142</guid>
		<description><![CDATA[I found the following Scala introduction, by Jason Baldridge, pretty useful, especially as it has a mild computational linguistics focus. It assumes no programming background, but that doesn&#8217;t get in the way too much. Part 1: the Scala REPL, expressions, variables, basic types, simple functions, saving and running programs, comments Part 2: Tuples, Lists, methods on Lists [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1142&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<div>I found the following Scala introduction, by <a href="http://bcomposes.wordpress.com/">Jason Baldridge</a>, pretty useful, especially as it has a mild computational linguistics focus. It assumes no programming background, but that doesn&#8217;t get in the way too much.</div>
<div></div>
<ul>
<li>Part 1: <a href="http://bcomposes.wordpress.com/2011/08/22/first-steps-in-scala-for-first-time-programmers-part-1/">the Scala REPL, expressions, variables, basic types, simple functions, saving and running programs, comments</a></li>
<li>Part 2: <a href="http://bcomposes.wordpress.com/2011/08/24/first-steps-in-scala-for-beginning-programmers-part-2/">Tuples, Lists, methods on Lists and Strings</a></li>
<li>Part 3: <a href="http://bcomposes.wordpress.com/2011/08/26/first-steps-in-scala-for-beginning-programmers-part-3/">Conditional execution with if-else blocks and matching</a></li>
<li>Part 4: <a href="http://bcomposes.wordpress.com/2011/08/30/first-steps-in-scala-for-beginning-programmers-part-4/">Iterating, mapping, filtering and counting</a></li>
<li>Part 5: <a href="https://bcomposes.wordpress.com/2011/09/04/first-steps-in-scala-for-beginning-programmers-part-5/">Regular expressions and matching with them</a></li>
<li>Part 6: <a href="http://bcomposes.wordpress.com/2011/09/06/first-steps-in-scala-for-beginning-programmers-part-6/">Regular expression matching and substitution with the Regex API</a></li>
<li>Part 7: <a href="http://bcomposes.wordpress.com/2011/09/12/first-steps-in-scala-for-beginning-programmers-part-7/">Maps, Sets, groupBy, Options, flatten, flatMap</a></li>
<li>Part 8: <a href="http://bcomposes.wordpress.com/2011/09/19/first-steps-in-scala-for-beginning-programmers-part-8/">Word counting, scala.io.Source, file access, flatMap, mutable Maps</a></li>
<li>Part 9: <a href="http://bcomposes.wordpress.com/2011/10/24/first-steps-in-scala-for-beginning-programmers-part-9/">Objects, classes, inheritance, traits, Lists with multiple related types, apply</a></li>
<li>Part 10: <a href="http://bcomposes.wordpress.com/2011/10/25/first-steps-in-scala-for-beginning-programmers-part-10/">Scripting, compiling, main methods, return values of functions</a></li>
<li>Part 11: <a href="http://bcomposes.wordpress.com/2011/10/26/first-steps-in-scala-for-beginning-programmers-part-11/">SBT, scalabha, packages, build systems</a></li>
<li>Part 12: <a href="http://bcomposes.wordpress.com/2011/11/14/first-steps-in-scala-for-beginning-programmers-part-12/">Code blocks, coding style, closures, scala documentation project</a></li>
</ul>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1142/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1142/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1142/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1142&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2011/12/23/useful-scala-introduction/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
		<item>
		<title>Computational Social Science on the cheap using Twitter</title>
		<link>http://willwhim.wordpress.com/2011/11/23/computational-social-science-on-the-cheap-using-twitter/</link>
		<comments>http://willwhim.wordpress.com/2011/11/23/computational-social-science-on-the-cheap-using-twitter/#comments</comments>
		<pubDate>Wed, 23 Nov 2011 00:37:37 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Language]]></category>
		<category><![CDATA[Science and Tech]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1136</guid>
		<description><![CDATA[This is a followup to my post Computational lexicography on the cheap using Twitter, but more especially in response to Using off-the-shelf software for basic Twitter analysis. The latter article shows how to use database software (MySQL and its implementation of the SQL language) to do basic Twitter analysis. The &#8216;basic analysis&#8217; includes counts by hashtag, [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1136&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>This is a followup to my post <a href="http://willwhim.wordpress.com/2010/10/11/computational-lexicography-on-the-cheap-using-twitter/">Computational lexicography on the cheap using Twitter</a>, but more especially in response to <a href="http://socialmediacollective.org/2011/10/06/using-off-the-shelf-software-for-basic-twitter-analysis/">Using off-the-shelf software for basic Twitter analysis</a>.</p>
<p>The latter article shows how to use database software (MySQL and its implementation of the SQL language) to do basic Twitter analysis. The &#8216;basic analysis&#8217; includes counts by hashtag, timelines, and word clouds. They analyse about 475k tweets.</p>
<p>But here&#8217;s the thing: all their analyses can be done more simply with simple text files and pipes of Unix commands (as most eloquently demonstrated in <a href="http://www.stanford.edu/class/cs224n/handouts/kwc-unix-for-poets.pdf">Unix for Poets,</a> by Ken Church). In fact, several simple   commands—commands I use every day—are powerful enough to do the kind of analyses they discuss.</p>
<p><strong>Getting the data.</strong></p>
<p>(You can skip over this if you have data already!)</p>
<p>Interestingly, they do <em>not</em> show how to get the tweets to begin with. My <a href="http://willwhim.wordpress.com/2010/10/11/computational-lexicography-on-the-cheap-using-twitter/">previous post</a> discusses this, but it might be useful to show a simple Ruby program that collects Tweet data, especially since the method has changed slightly since my post. The biggest hurdle is setting up authentication to access Twitter&#8217;s data—discussed in full, <a href="https://dev.twitter.com/docs/auth/oauth-landing">here</a>, but the crucial thing is that you have to register as a Twitter developer, register a Twitter application, and get special tokens. You create an application at the Twitter <a href="https://dev.twitter.com/apps">apps</a> page; from that same location you generate the special tokens.</p>
<p>Here&#8217;s the Ruby script (also listed <a title="Gist at Github" href="https://gist.github.com/1387532" target="_blank">here</a>).</p>
<pre>require 'rubygems'
require 'tweetstream'
require 'date'

TweetStream.configure do |config|
  config.consumer_key = ''
  config.consumer_secret = ''
  config.oauth_token = ''
  config.oauth_token_secret = ''
  config.auth_method = :oauth
  config.parser   = :json_gem
end

# Change the words you want to track
TweetStream::Client.new.track('football', 'baseball', 'soccer', 'cricket') do |status|
  begin
    # The Tweet id
    id = status.id
    # The text of the tweet, with new lines (returns) replaced by spaces
    txt = status.text.gsub(/\n/," ")
    # The date of the tweet, printed out in a slightly more useful form
    # for our purposes
    d = DateTime.parse(status.created_at).strftime("%Y-%m-%d\t%H:%M:%S")
    puts [id,txt,d].join("\t")
  rescue Exception =&gt; e
    puts "!!! Error: #{e.to_s}"
  end
end</pre>
<p>With the proper keys and secrets, this gist will allow you to track keywords over time, and print out, in a tab-separated format, the tweet id, the text of the tweet, the date, and the time it was published (in UTC, or Greenwich, time). You could add additional columns, as described (by example) in the Twitter API.</p>
<p>The example here tracks mentions of football, baseball, soccer, and cricket, but obviously, these could be other keywords. Running this using this command:</p>
<pre>ruby track_tweets.rb | tee nsports.tsv</pre>
<p>will place tweets in the file &#8216;nsports.tsv&#8217;.</p>
<p><strong>Basic statistics</strong></p>
<p>Counting the number of football, baseball, etc. mentions is easy:</p>
<pre>$ grep -i football nsports.tsv | wc -l
$ grep -i baseball nsports.tsv | wc -l
$ grep -i soccer nsports.tsv | wc -l
$ grep -i cricket nsports.tsv | wc -l</pre>
<p>As well as getting the number of lines in the file:</p>
<pre>$ cat nsports.tsv | wc -l</pre>
<p>The second analysis was to count who is retweeted the most, done by counting the username after the  standard Twitter &#8220;RT &#8221; (eg &#8220;rt @willf good stuff!&#8221;). The following pipeline of commands accomplishes this simply enough:</p>
<pre>egrep -io "rt +@\w+" nsports.tsv | perl -pe "s/ +/ /g" | cut -f2 -d\  | sort | uniq -c | sort -rn | head</pre>
<p>(This may be easier to copy from <a href="https://gist.github.com/1387515" target="_blank">here</a>). Each of these is a separate command, and the pipe symbol (|), indicates that the output from one command goes on to the next. Here&#8217;s what these commands do:</p>
<ol>
<li>egrep -io &#8220;rt +@\w+&#8221; nsports.tsv &#8212; searches through the tweets for the pattern RT space @ name, where there is one or more spaces, and one or more &#8216;word&#8217; characters. It only prints the matching parts (-o), and ignores differences in case (-i).</li>
<li>perl -pe &#8220;s/ +/ /g&#8221; &#8212; I noticed that from time to time, there is more than one space after the &#8216;RT&#8217;, so this substitutes one or more spaces with exactly one space.</li>
<li>cut -f2 -d\  &#8212; each line now looks like &#8220;RT @name&#8221;, and this command &#8216;cuts&#8217; the second field out of each line, with a delimiter of a space. This results in each line looking like &#8216;@name&#8217;.</li>
<li>sort | uniq -c | sort -rn &#8212; this is three commands, but I type them so frequently, it seems like one to me. It sorts the text, so they can be counted with the uniq command, which produces two columns : the count and the name; we reverse sort (-r) on the first numeric field (-n)</li>
<li>head &#8212; this shows the top ten lines from a file.</li>
</ol>
<p>This command pipeline should have no problem handling 475k lines.</p>
<p>The third analysis was to put the data in a format that can be used by Excel to create a graph, with counts by day. Because we have printed the date and time in separate columns, with the date in column 3, we can simply do the cut, sort, uniq series:</p>
<pre>cat nsports.tsv | cut -f3 | sort | uniq -c &gt; for_excel.tsv</pre>
<p>This will put the data into a format that Excel can read.</p>
<p>Finally, the authors show how to create Wordle word graphs overall, and for the categories. I&#8217;m not a big fan of these as a data exploration tool, but notice you can use cut -f2 to get the text to paste into Wordle.</p>
<p>So, this is computational social science on the cheap using Twitter, using some basic Unix commands (cat, cut, sort, uniq, grep), with one tiny, tiny call to Perl. You can do this too&#8211;and it&#8217;s easier to learn than MySQL and SQL! Plus, you can easily read the text files that are created. All of this was done on a standard Mac, but any Unix machine, or Windows machine with the <a href="http://cygwin.com/" target="_blank">Cygwin</a> tools installed, can do this as well.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1136/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1136/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1136/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1136/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1136/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1136/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1136/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1136/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1136/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1136/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1136/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1136/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1136/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1136/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1136&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2011/11/23/computational-social-science-on-the-cheap-using-twitter/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
		<item>
		<title>Leroy Herron</title>
		<link>http://willwhim.wordpress.com/2011/11/06/leroy-herron/</link>
		<comments>http://willwhim.wordpress.com/2011/11/06/leroy-herron/#comments</comments>
		<pubDate>Sun, 06 Nov 2011 06:10:19 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Personal and family]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1131</guid>
		<description><![CDATA[When I was in junior high school at Burton Junior High School &#8212; that is, grades seven and eight &#8212; Mr Leroy Herron was a very important man in my life. He was a school counselor, and a coach for the basketball team. He was also the sponsor of the Human Relations Club, a club [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1131&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>When I was in junior high school at Burton Junior High School &#8212; that is, grades seven and eight &#8212; Mr Leroy Herron was a very important man in my life. He was a school counselor, and a coach for the basketball team. He was also the sponsor of the Human Relations Club, a club created to get black kids and white kids like me to learn more about what I now might call anti-racism, but then we mostly called non-discrimination. If I remember correctly, there were two white boys &#8212; Alan Kulevicz and me, and about a half dozen black girls. The school itself had a strong majority of white kids. I remember Mr Herron talking about how his son self-identified as &#8220;black,&#8221; while Mr Herron felt more comfortable, at that time, calling himself a Negro. If I recall correctly, African American, or Afro-American were also coming into vogue. </p>
<p>We once did a field trip to a school in Detroit where the students were all (or almost all) African American. I remember asking the principal how many of his staff were black, and how many were white. He had to stop and think, and he said that he didn&#8217;t primarily think of the teachers in racial terms. Since knowing whether someone was black or white was very important in my family, this came as a shock, and a new way of thinking. </p>
<p>Mr Herron loved sports, and he loved coaching. I wish I had been a decent ball player, but instead I just acted as the team&#8217;s manager. I don&#8217;t remember much about this experience, except I was at one point asked to keep score for the number of times players in the game showed &#8220;hustle,&#8221; and I had no idea how to do this, so I got razzed about it. I really was not a good manager &#8212; not as bad as I was as a baseball umpire, but that&#8217;s another story. </p>
<p>One time, I left school crying. I don&#8217;t know why now &#8212; I was probably being bullied for being smart and weak and unpopular in some way. We lived about a mile away from the school, and I usually walked. And Mr Herron left the school looking for me, and drove until he found me. I think that I refused his help then, but his act of looking out for me is something I remember forty years later.</p>
<p>The Macomb Daily (the local county paper) <a href="http://www.macombdaily.com/articles/2009/02/10/news/srv0000004672834.txt?viewmode=default">reported</a> back in February of 2009 that Mr Herron died in a house fire at the age of 75. My youngest brother Steve mentioned this to me over the phone. Mr Herron eventually became an assistant superintendent of the Roseville schools. I assume that he brought his love for students, for sports, and for racial equality to that job as well. </p>
<p>I never caught his love for sports, but he began to open my eyes to the experiences of African Americans, and he began to turn me into a man, for which I will always be grateful. </p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1131/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1131/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1131/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1131/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1131/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1131/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1131/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1131/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1131/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1131/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1131/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1131/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1131/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1131/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1131&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2011/11/06/leroy-herron/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
		<item>
		<title>I am a Wordnik</title>
		<link>http://willwhim.wordpress.com/2011/11/04/i-am-a-wordnik/</link>
		<comments>http://willwhim.wordpress.com/2011/11/04/i-am-a-wordnik/#comments</comments>
		<pubDate>Fri, 04 Nov 2011 04:14:28 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Personal and family]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1124</guid>
		<description><![CDATA[This week, I started as the lead engineer for Wordnik&#8217;s analytics platform. Except I get a little antsy about the term &#8220;engineer,&#8221; so I asked them to make my title &#8220;Lead, Analytics Platform.&#8221; It&#8217;s a real pleasure to work with the Wordnik team so far&#8211;super excited to be working with Tony Tam and Erin McKean, [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1124&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>This week, I started as the lead engineer for <a href="http://wordnik.com">Wordnik</a>&#8217;s analytics platform. Except I get a little antsy about the term &#8220;engineer,&#8221; so I asked them to make my title &#8220;Lead, Analytics Platform.&#8221; It&#8217;s a real pleasure to work with the Wordnik team so far&#8211;super excited to be working with Tony Tam and Erin McKean, and also former Powersetters Colin Pollack and Robert Voyer. When Robert joined Wordnik over a year ago, I badgered him into getting me an interview&#8211;it&#8217;s only now that it&#8217;s come to fruition.</p>
<p>There were many good things about working at Bing and Microsoft, especially the large amounts of friendship I found there, and the large amounts of data I got to explore and understand. Still, it was a real joy to fire up a terminal session and start exercising my atrophied Unix muscles.</p>
<p>I&#8217;ll be spending most of my time in Silicon Valley/San Francisco with visits back to Michigan from time to time.</p>
<p>Let me end by pointing to Erin&#8217;s inspiring TED talk, which was the starting point of my path to Wordnik.</p>
<object width="446" height="326"><param name="movie" value="http://video.ted.com/assets/player/swf/EmbedPlayer.swf"></param><param name="allowFullScreen" value="true" /><param name="wmode" value="transparent"></param><param name="bgColor" value="#ffffff"></param> <param name="flashvars" value="vu=http://video.ted.com/talks/embed/ErinMcKean_2007-embed_high.flv&su=http://images.ted.com/images/ted/tedindex/embed-posters/ErinMcKean-2007.embed_thumbnail.jpg&vw=432&vh=240&ap=0&ti=161" /><embed src="http://video.ted.com/assets/player/swf/EmbedPlayer.swf" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash" wmode="transparent" bgColor="#ffffff" width="446" height="326" allowFullScreen="true" flashvars="vu=http://video.ted.com/talks/embed/ErinMcKean_2007-embed_high.flv&su=http://images.ted.com/images/ted/tedindex/embed-posters/ErinMcKean-2007.embed_thumbnail.jpg&vw=432&vh=240&ap=0&ti=161"></embed></object>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1124/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1124/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1124/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1124&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2011/11/04/i-am-a-wordnik/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
		<item>
		<title>Remembering Miss Mullens</title>
		<link>http://willwhim.wordpress.com/2011/10/07/remembering-miss-mullens/</link>
		<comments>http://willwhim.wordpress.com/2011/10/07/remembering-miss-mullens/#comments</comments>
		<pubDate>Fri, 07 Oct 2011 15:33:31 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Personal and family]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1120</guid>
		<description><![CDATA[It&#8217;s Ada Lovelace Day and we are encouraged to write about women who were significant in getting us involved in Science and Technology. I remember my Junior High Geometry teacher, Miss Mullens. She was very, very short, kind of shy, but very funny &#8212; a classic geek, really, now that I think of it (geek, [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1120&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>It&#8217;s <a href="http://findingada.com/">Ada Lovelace Day</a> and we are encouraged to write about women who were significant in getting us involved in Science and Technology.</p>
<p>I remember my Junior High Geometry teacher, Miss Mullens. She was very, very short, kind of shy, but very funny &#8212; a classic geek, really, now that I think of it (geek, of course, being a term of praise here, not a negative thing).</p>
<p>Junior High school geometry, for me, was mostly about learning to do proofs &#8212; classic Elements of Euclid stuff. This was in the heyday of the &#8220;new math&#8221; movement, and I think &#8212; although this is a long time ago &#8212; that they emphasized thought processes over rote memorization. And I loved doing proofs, getting them right. I know I got an &#8220;A+&#8221; in the class. It was a great encouragement to me. I must have had this class in grade 9, because I didn&#8217;t do well in Algebra (8th grade) until the teacher &#8212; Mr Perkins &#8212; called me out on my laziness. Mr Perkins actually had a paddle and used it on students (this was in the late 60s). Miss Mullens was too small for that of course &#8212; but I would have done anything for her; her praise was enough.</p>
<p>Alas, I doubt if she&#8217;ll read these words &#8212; perhaps, even, her name was Mullins, or Mullen. It&#8217;s been a long time. But she was a great teacher, and I remember her fondly.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1120/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1120/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1120/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1120/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1120/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1120/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1120/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1120/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1120/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1120/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1120/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1120/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1120/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1120/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1120&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2011/10/07/remembering-miss-mullens/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
		<item>
		<title>Producing n hash functions by hashing only once</title>
		<link>http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/</link>
		<comments>http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/#comments</comments>
		<pubDate>Sat, 03 Sep 2011 06:52:38 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Go]]></category>
		<category><![CDATA[Science and Tech]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1115</guid>
		<description><![CDATA[To implement a bloom filter, you need a bunch of hash functions. In naive implementations (and I&#8217;ve seen plenty), programmers pick out, say, five cryptographic hash functions. One problem with this is that the hash functions for bloom filters have different requirements than hash functions for cryptography&#8211;the latter tend to be more than is required [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1115&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>To implement a bloom filter, you need a bunch of hash functions. In naive implementations (and I&#8217;ve seen plenty), programmers pick out, say, five cryptographic hash functions. One problem with this is that the hash functions for bloom filters have different requirements than hash functions for cryptography&#8211;the latter tend to be more than is required for the former. What you want for bloom filters is something that&#8217;s very, very fast, while maintaining that basic desideratum for bloom filter hashing, uniform spread.</p>
<p>There&#8217;s a <a href="http://bit.ly/rgYdK3">good paper</a> that reminds us that you can easily simulate <em>n</em> hash functions by having just <em>two</em> hash functions around. This can be as simple as this function to create the <em>i</em>th hash of a key, given the results <em>a</em> and <em>b</em> of hashing a key with these two functions:</p>
<blockquote><p>hash(i) = (a + b * i ) % m</p></blockquote>
<p>where <em>m</em> is the maximum value of the hash (for example, the number of buckets in a bloom filter).</p>
<p><strong>But here&#8217;s a good trick not really worth a paper&#8211;but it&#8217;s still a good trick. Typically, it&#8217;s totally reasonable to limit the size to under the maximum size of an unsigned 32-bit number. These days, at least, it&#8217;s probably <em>cheaper</em> to calculate a base hash function on unsigned 64-bit numbers. So, you can take the upper half and the lower half of the 64-bit hashed value and return them as <em>two</em> 32-bit numbers.</strong></p>
<p>Voila! Double hashing with one hash function. And using <a href="http://en.wikipedia.org/wiki/Fowler-Noll-Vo_hash_function">FNV</a> means you have a very cheap hash function to start with, so really this can be very, very fast.</p>
<p>I implemented this in my <a href="https://github.com/willf/bloom">bloom filter code for go</a>.</p>
<p>For <a href="https://twitter.com/#!/mojombo/status/109838512337059840">@mojombo</a>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1115/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1115/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1115/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1115&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/feed/</wfw:commentRss>
		<slash:comments>5</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
		<item>
		<title>On &#8220;Retiring a Great Interview Problem&#8221;</title>
		<link>http://willwhim.wordpress.com/2011/08/09/on-retiring-a-great-interview-problem/</link>
		<comments>http://willwhim.wordpress.com/2011/08/09/on-retiring-a-great-interview-problem/#comments</comments>
		<pubDate>Tue, 09 Aug 2011 08:48:59 +0000</pubDate>
		<dc:creator>Will Fitzgerald</dc:creator>
				<category><![CDATA[Science and Tech]]></category>

		<guid isPermaLink="false">http://willwhim.wordpress.com/?p=1111</guid>
		<description><![CDATA[Daniel Tunkelang wrote an interesting blog post, &#8220;Retiring a Great Interview Problem&#8221; on an interview problem that he has, in the past, posed to interviewees, but which he has now retired, because someone posted the problem, and a solution, to an interview problem website. The problem is the segmentation problem (which he calls the word-break problem): [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1111&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>Daniel Tunkelang wrote an interesting blog post, &#8220;<a href="http://thenoisychannel.com/2011/08/08/retiring-a-great-interview-problem/">Retiring a Great Interview Problem</a>&#8221; on an interview problem that he has, in the past, posed to interviewees, but which he has now retired, because someone posted the problem, and a solution, to an interview problem website. The problem is the <em>segmentation problem</em> (which he calls the <em>word-break problem</em>): given a string, segment it into reasonable words, if possible (for example, breaking &#8220;applepie&#8221; into &#8220;apple pie&#8221;). He describes the various gradations of good answers to this problem, and I guess I would say that his reasoning is sound: candidates who give a better answer than a worse answer would probably make better programmers.</p>
<p>However, if I were asked this question in an interview, I am fairly confident that I would freeze up and not give a good answer. Partially, this is because of the stupidly artificial conversational model that occurs when one is participating in a white-board exercise&#8211;what, program without the resources of the internet, while someone watches me, and makes me elucidate my mental states? But the other reason I&#8217;d have a problem is that I would recognize this problem as one I have already coded and placed on GitHub (<a href="https://github.com/willf/microsoft_ngram">https://github.com/willf/microsoft_ngram</a>) which is in turn based on Peter Norvig&#8217;s section on segmentation in the book Beautiful Data (for which he posted <a href="http://norvig.com/ngrams/">Python code</a>). This code does segmentation even <em>better</em> than Tunkelang&#8217;s formulation of the problem, in that it provides <em>the most probable</em> segmentation (Tunkelang&#8217;s formulation asked for any reasonable segmentation). I also know (or at least believe) that Norvig&#8217;s code is based on similar code he wrote for his Artificial Intelligence textbook (here&#8217;s the <a href="http://aima-python.googlecode.com/svn/trunk/text.py">code</a>) and that using memoization as a lazy person&#8217;s method of doing dynamic programming is an old and efficient trick of Norvig&#8217;s. This knowledge would basically drain me of any interest in reinventing this particular wheel.</p>
<p>In other words, I would fail this interview, although I have publicly demonstrated my ability to implement a more generally useful version. Despite Tunkelang being a vastly better programmer, he got it wrong on the first try (he had an &#8220;off-by-one&#8221; error, which he later corrected). Granted, interviewers (or interviewers in the Google style) are generally more concerned about precision (avoiding hiring bad programmers) than recall (avoiding missing out on good programmers). But I continue to think that the whiteboard method systematically discriminates against certain classes of programmers. I suspect, in particular, it discriminates against older candidates, which, <em>prima facie</em> would be illegal under US law.</p>
<p>[Update, added later the same day]. Take the &#8220;I suspect&#8221; literally in the last paragraph. I&#8217;m still trying to articulate what I think about interviewing techniques of this type and discrimination, particularly age discrimination (the only kind I&#8217;m potentially subject to). And take <em>prima facie</em> literally, too&#8211;I doubt that, in a court of law, age discrimination could be proved based solely on these interviews.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/willwhim.wordpress.com/1111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/willwhim.wordpress.com/1111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/willwhim.wordpress.com/1111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/willwhim.wordpress.com/1111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/willwhim.wordpress.com/1111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/willwhim.wordpress.com/1111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/willwhim.wordpress.com/1111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/willwhim.wordpress.com/1111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/willwhim.wordpress.com/1111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/willwhim.wordpress.com/1111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/willwhim.wordpress.com/1111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/willwhim.wordpress.com/1111/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/willwhim.wordpress.com/1111/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/willwhim.wordpress.com/1111/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=willwhim.wordpress.com&amp;blog=9342345&amp;post=1111&amp;subd=willwhim&amp;ref=&amp;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://willwhim.wordpress.com/2011/08/09/on-retiring-a-great-interview-problem/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/4b85e6b127c527c8dcebe18d1c985e48?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">asimpledesire</media:title>
		</media:content>
	</item>
	</channel>
</rss>
