{"id":1268,"date":"2017-05-05T00:01:31","date_gmt":"2017-05-05T00:01:31","guid":{"rendered":"http:\/\/10sa.com\/sql_stories\/?p=1268"},"modified":"2017-05-05T00:07:16","modified_gmt":"2017-05-05T00:07:16","slug":"spark-and-csv-and-sql","status":"publish","type":"post","link":"http:\/\/10sa.com\/sql_stories\/?p=1268","title":{"rendered":"Spark and CSV and SQL"},"content":{"rendered":"<p>SQL<\/p>\n<pre language=\"python\">\r\ndata_frame = spark.read.csv(\"\/db\/nowe\/APLUS_LOG_MINMAX_1183241001_LoggerData.csv\", header=True).select(\"ID\", \"UTC\").limit(200)\r\ndata_frame.createOrReplaceTempView(\"my_table\")\r\n# What happened? \r\n# data_frame.printSchema()\r\nspark.sql(\"desc my_table\").show()\r\n# Wow\r\n\r\n# data_frame.first()\r\nspark.sql(\"select * from my_table limit 1\").show()\r\n\r\n# data_frame.withColumnRenamed\r\nspark.sql(\"select ID as SOME_ID from my_table limit 1\").show()\r\n\r\n# Casting... data_frame.select(data_frame.ID.cast(\"float\")).show(2)\r\nspark.sql(\"select CAST(ID as FLOAT) as SOME_ID from my_table limit 1\").show()\r\n\r\n# Now, cast ID to float, then get only the ID which can be divided by 2\r\n# data_frame.select( (data_frame.ID.cast(\"float\") % 2).alias(\"IS_DIV_BY_TWO\")).filter(\"IS_DIV_BY_TWO < 1\").show(10)\r\nspark.sql(\"select ID, (ID  % 2) AS IS_DIV_BY_TWO from my_table HAVING IS_DIV_BY_TWO < 1 LIMIT 10\").show()\r\n\r\n# Group by with aliasing\r\n# data_frame.select( (data_frame.ID.cast(\"float\") % 2) .alias(\"IS_DIV_BY_TWO\")).groupby(\"IS_DIV_BY_TWO\").count().collect()\r\nspark.sql(\"SELECT md, count(*) FROM (select (ID%2) AS md FROM my_table) group by md\").show()\r\n\r\n<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>SQL data_frame = spark.read.csv(&#8220;\/db\/nowe\/APLUS_LOG_MINMAX_1183241001_LoggerData.csv&#8221;, header=True).select(&#8220;ID&#8221;, &#8220;UTC&#8221;).limit(200) data_frame.createOrReplaceTempView(&#8220;my_table&#8221;) # What happened? # data_frame.printSchema() spark.sql(&#8220;desc my_table&#8221;).show() # Wow # data_frame.first() spark.sql(&#8220;select * from my_table limit 1&#8221;).show() # data_frame.withColumnRenamed spark.sql(&#8220;select ID as SOME_ID from my_table limit 1&#8221;).show() # Casting&#8230; data_frame.select(data_frame.ID.cast(&#8220;float&#8221;)).show(2) spark.sql(&#8220;select CAST(ID as FLOAT) as SOME_ID from my_table limit 1&#8221;).show() # Now, cast ID to float, then get [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[24],"tags":[],"_links":{"self":[{"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=\/wp\/v2\/posts\/1268"}],"collection":[{"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1268"}],"version-history":[{"count":5,"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=\/wp\/v2\/posts\/1268\/revisions"}],"predecessor-version":[{"id":1276,"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=\/wp\/v2\/posts\/1268\/revisions\/1276"}],"wp:attachment":[{"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1268"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=1268"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/10sa.com\/sql_stories\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1268"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}