{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Handling text with Python"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get started"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text1 = 'In this world nothing can be said to be certain except death and taxes'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(text1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text2 = text1.split(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(text2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Finding Specific Words"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Words that are more than 5 letters long"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "[w for w in text2 if len(w) > 5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Words that end with  'd'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "[w for w in text2 if w.endswith('d')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Words that start with  'd'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "[w for w in text2 if w.startswith('d')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "[w for w in text2 if w.startswith(('d','n'))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Using set()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Finding unique words from a list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text3 = 'to be or not to be'\n",
    "text4 = text3.split(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text5 = set(text4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(text5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### set vs list performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import random\n",
    "\n",
    "NUMBER_OF_ELEMENTS = 10000\n",
    "\n",
    "# Create a list\n",
    "lst = list(range(NUMBER_OF_ELEMENTS))\n",
    "random.shuffle(lst)\n",
    "\n",
    "# Crt=eate a set from the list\n",
    "s = set(lst)\n",
    "\n",
    "# Test if an element is in the set\n",
    "start = time.time()\n",
    "for i in range(NUMBER_OF_ELEMENTS):\n",
    "    i in s\n",
    "end = time.time()\n",
    "print(f'To test if {NUMBER_OF_ELEMENTS} elements are in the set, runtime is {end-start} seconds')\n",
    "\n",
    "# Test if an element is in the list\n",
    "start = time.time()\n",
    "for i in range(NUMBER_OF_ELEMENTS):\n",
    "    i in lst\n",
    "end = time.time()\n",
    "print(f'To test if {NUMBER_OF_ELEMENTS} elements are in the list, runtime is {end-start} seconds')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. More String Operations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### t in s - To check whether a substring exists in a given string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'Python' in 'Python is good'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.isupper() - To check whether all the characters of the string are uppercase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'PYTHON IS GOOD'.isupper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'Python is good'.isupper()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.islower() - To check whether all the characters of the string are lowercase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'python is good'.islower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'Python is good'.islower()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.istitle() - To check whether each word in the string starts with an uppercase letter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'Python Is Good'.istitle()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'Python is good'.istitle()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.isdigit() - To check whether the string contains digits only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'000002'.isdigit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'SZ000002'.isdigit()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.isalpha() - To check whether the string contains alphabetic characters only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'Textmining'.isalpha()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'Text mining1'.isalpha()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.isalnum() - To check whether the string contains alphanumeric characters only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'SZ000002'.isalnum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'SZ_000002#'.isalnum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Conversion between uppercase and lowercase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = 'python is good'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.upper() - Returns a string in which all characters are uppercased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s2 = s1.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.lower() - Returns a string in which all characters are lowercased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s2.lower()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.capitalize() - Returns a string with only its first character capitalized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1.capitalize()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.title() - Returns a string in which first characters of all the words are capitalized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1.title()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s3 = 'cattcatt'\n",
    "s4 = s3.split('a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### join()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'a'.join(s4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get all the characters of s3 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(s3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "[c for c in s3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### String formatting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'{} {}'.format('hello', 'world')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'{} {}'.format(24, 'seconds')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "24 + 'seconds' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### f-string "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "name = 'Eric'\n",
    "age = 74"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f'Hello, {name}. You are {age}.'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.strip() – Removes whitespaces at the beginning and at the end of the string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s5 = '  a quick brown fox jumped over the lazy dog  '\n",
    "s6 = s5.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s6"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### sring.replace(old, new) - Returns a string where all occurrences of the old substring are replaced with the new substring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s6.replace('o', 'O')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s6.replace('o', 'O', 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### string.translate(table) - The translate() method returns a string where each character is mapped to its corresponding character in the translation table. The translation table is created by the static method maketrans()."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "intab = 'aeiou'\n",
    "outtab = '12345'\n",
    "table = str.maketrans(intab, outtab)\n",
    "s7 = 'aeiou-xmppp'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s7.translate(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "table_1 = str.maketrans(intab, outtab, 'xm')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s7.translate(table_1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Index and Slice String"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Accessing characters by positive index number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = 'Hello World!'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[4]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Accessing characters by negative index number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[-3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Slicing strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[1:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[-4:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[-2:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Specifying stride while slicing strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[:5:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[:5:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s[-1:-7:-2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### String.find() - Return the index of the first occurrence of the substring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s.find('o')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s.find('or')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Writing to and Reading from CSV File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Writing to a csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test.csv', 'w', encoding='utf8', newline='') as wf:\n",
    "    writer = csv.writer(wf)\n",
    "    writer.writerow(('张三','北京'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading from a csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test.csv', 'r', encoding='utf8') as rf:\n",
    "    r = csv.reader(rf)\n",
    "    for row in r:\n",
    "        print(f'姓名:{row[0]}, 住址:{row[1]}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise1\n",
    "## Write a Python program to get a string from a given string where all occurrences of its first char have been changed to '@' except the first char itself.\n",
    "### Sample String : 'restart'\n",
    "### Expected Result : 'resta@t'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_char(str1):\n",
    "    ### START CODE HERE ###\n",
    "\n",
    "    ### END CODE HERE ###\n",
    "    return str1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check your function\n",
    "print(change_char('restart'))\n",
    "print(change_char('text'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Expected output\n",
    "```\n",
    "resta@t\n",
    "tex@\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise2\n",
    "## Given an input string with the combination of the lower and upper case, arrange characters in such a way that all lowercase letters should come first.\n",
    "### Sample String : 'PyNaTive'\n",
    "### Expected Result : 'yaivePNT'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def arrange_chars(str1):\n",
    "   \n",
    "  ### START CODE HERE ###\n",
    "\n",
    "  ### END CODE HERE ###\n",
    "  return str1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check your function\n",
    "print(arrange_chars('PyNaTive'))\n",
    "print(arrange_chars('OpTYabi'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Expected output\n",
    "```\n",
    "yaivePNT\n",
    "pabiOTY\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise3\n",
    "## Write a Python function that takes a list of words and returns the word with the largest length.\n",
    "### Sample List : [\"Python\", \"Text\", \"Analysis\"]\n",
    "### Expected Result : 'Analysis'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_longest_word(words_list):\n",
    "    longest_word = ''\n",
    "    max_len = 0\n",
    "    ### START CODE HERE ###\n",
    "\n",
    "    ### END CODE HERE ###\n",
    "    return longest_word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check your function\n",
    "print(find_longest_word([\"Python\", \"Text\", \"Analysis\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Expected output\n",
    "```\n",
    "Analysis\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}