{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DataFrame cơ bản\n",
    "Tập dữ liệu phân tán biểu diễn dưới dạng dòng và cột như CSDL."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SQLContext: Tạo DataFrame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Từ danh sách tuples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Row(_1=u'Alice', _2=1)]\n",
      "[Row(name=u'Alice', age=1)]\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): `sqlContext` and `sc` are not defined in the visible cells;\n",
    "# presumably the PySpark kernel provides them -- confirm.\n",
    "l = [(\"Alice\", 1)]\n",
    "# Without column names the fields get positional names (_1, _2).\n",
    "print(sqlContext.createDataFrame(l).collect())\n",
    "# A list of column names labels the fields.\n",
    "print(sqlContext.createDataFrame(l, [\"name\", \"age\"]).collect())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Từ RDDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(_1=u'Alice', _2=1)]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Turn the local list into an RDD, then build the DataFrame from the RDD.\n",
    "rdd = sc.parallelize(l)\n",
    "sqlContext.createDataFrame(rdd).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name=u'Alice', age=1)]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `df` is reused by the pandas and format-conversion cells further down.\n",
    "df = sqlContext.createDataFrame(rdd, [\"name\", \"age\"])\n",
    "df.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Row(a=u'Alice', b=1)]\n",
      "[Row(value=1)]\n"
     ]
    }
   ],
   "source": [
    "# A schema string sets both the column names and their types.\n",
    "print(sqlContext.createDataFrame(rdd, \"a: string, b: int\").collect())\n",
    "# Keep the single-column RDD under its own name: rebinding `rdd` here would\n",
    "# make this cell and the cells above fail when they are re-run.\n",
    "age_rdd = sc.parallelize(l).map(lambda row: row[1])\n",
    "# A bare type string gives a single column named `value`.\n",
    "print(sqlContext.createDataFrame(age_rdd, \"int\").collect())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Từ Row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name=u'Alice', age=12)]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.sql import Row\n",
    "# NOTE: `rdd` is rebound to new sample data here; the schema example in the\n",
    "# next code cell builds on this RDD.\n",
    "rdd = sc.parallelize([(\"Alice\", 12)])\n",
    "# Person is a Row template with fields name and age; Person(*r) fills it\n",
    "# from each tuple.\n",
    "Person = Row(\"name\", \"age\")\n",
    "person = rdd.map(lambda r: Person(*r))\n",
    "df2 = sqlContext.createDataFrame(person)\n",
    "df2.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Từ Schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(name=u'Alice', age=12)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): wildcard import kept as-is because cells outside this view may\n",
    "# use other names from pyspark.sql.types; prefer explicit imports once confirmed.\n",
    "from pyspark.sql.types import *\n",
    "# Explicit schema: each StructField gives a column name, a type and a nullable flag.\n",
    "schema = StructType([\n",
    "    StructField(\"name\", StringType(), True),\n",
    "    StructField(\"age\", IntegerType(), True)\n",
    "])\n",
    "df3 = sqlContext.createDataFrame(rdd, schema)\n",
    "df3.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Từ pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Row(name=u'Alice', age=1)]\n",
      "[Row(0=1, 1=2)]\n"
     ]
    }
   ],
   "source": [
    "import pandas\n",
    "# Round trip Spark -> pandas -> Spark keeps the column names.\n",
    "print(sqlContext.createDataFrame(df.toPandas()).collect())\n",
    "# A pandas frame built without column labels gets its integer labels as names.\n",
    "print(sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chuyển đổi định dạng"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(f1=u'Alice', f2=1)]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# toDF returns a DataFrame with the columns renamed to the given names.\n",
    "df.toDF(\"f1\", \"f2\").collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'{\"name\":\"Alice\",\"age\":1}'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# toJSON serialises the rows to JSON strings; show the first one.\n",
    "df.toJSON().first()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | name | \n", "age | \n", "
---|---|---|
0 | \n", "Alice | \n", "1 | \n", "