{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Handling text with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get started" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text1 = 'In this world nothing can be said to be certain except death and taxes'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(text1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text2 = text1.split(' ')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(text2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Finding Specific Words" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Words that are more than 5 letters long" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[w for w in text2 if len(w) > 5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Words that end with 'd'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[w for w in text2 if w.endswith('d')]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Words that start with 'd'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[w for w in text2 if w.startswith('d')]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[w for w in text2 if w.startswith(('d','n'))]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. 
Using set()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Finding unique words from a list" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text3 = 'to be or not to be'\n", "text4 = text3.split(' ')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text5 = set(text4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(text5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### set vs list performance" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "import random\n", "\n", "NUMBER_OF_ELEMENTS = 10000\n", "\n", "# Create a list\n", "lst = list(range(NUMBER_OF_ELEMENTS))\n", "random.shuffle(lst)\n", "\n", "# Create a set from the list\n", "s = set(lst)\n", "\n", "# Test if an element is in the set\n", "start = time.time()\n", "for i in range(NUMBER_OF_ELEMENTS):\n", " i in s\n", "end = time.time()\n", "print(f'To test if {NUMBER_OF_ELEMENTS} elements are in the set, runtime is {end-start} seconds')\n", "\n", "# Test if an element is in the list\n", "start = time.time()\n", "for i in range(NUMBER_OF_ELEMENTS):\n", " i in lst\n", "end = time.time()\n", "print(f'To test if {NUMBER_OF_ELEMENTS} elements are in the list, runtime is {end-start} seconds')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 
More String Operations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### t in s - To check whether a substring exists in a given string" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'Python' in 'Python is good'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.isupper() - To check whether all the characters of the string are uppercase" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'PYTHON IS GOOD'.isupper()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'Python is good'.isupper()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.islower() - To check whether all the characters of the string are lowercase" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'python is good'.islower()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'Python is good'.islower()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.istitle() - To check whether each word in the string starts with an uppercase letter" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'Python Is Good'.istitle()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'Python is good'.istitle()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.isdigit() - To check whether the string contains digits only" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'000002'.isdigit()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'SZ000002'.isdigit()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.isalpha() - To check whether the string contains alphabetic characters 
only" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'Textmining'.isalpha()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'Text mining1'.isalpha()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.isalnum() - To check whether the string contains alphanumeric characters only" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'SZ000002'.isalnum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'SZ_000002#'.isalnum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Conversion between uppercase and lowercase" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s1 = 'python is good'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.upper() - Returns a string in which all characters are uppercased" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s2 = s1.upper()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.lower() - Returns a string in which all characters are lowercased" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s2.lower()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.capitalize() - Returns a string with only its first character capitalized" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s1.capitalize()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.title() - Returns a string in which first characters of all the words are capitalized" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"s1.title()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.split()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s3 = 'cattcatt'\n", "s4 = s3.split('a')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### join()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'a'.join(s4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get all the characters of s3 " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "list(s3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[c for c in s3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### String formatting" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'{} {}'.format('hello', 'world')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'{} {}'.format(24, 'seconds')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "24 + 'seconds' " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### f-string " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "name = 'Eric'\n", "age = 74" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f'Hello, {name}. 
You are {age}.'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.strip() - Removes whitespace at the beginning and at the end of the string" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s5 = ' a quick brown fox jumped over the lazy dog '\n", "s6 = s5.strip()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.replace(old, new) - Returns a string where all occurrences of the old substring are replaced with the new substring" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s6.replace('o', 'O')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s6.replace('o', 'O', 2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.translate(table) - The translate() method returns a string where each character is mapped to its corresponding character in the translation table. The translation table is created by the static method maketrans()." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "intab = 'aeiou'\n", "outtab = '12345'\n", "table = str.maketrans(intab, outtab)\n", "s7 = 'aeiou-xmppp'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s7.translate(table)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "table_1 = str.maketrans(intab, outtab, 'xm')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s7.translate(table_1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. 
Index and Slice String" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Accessing characters by positive index number" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s = 'Hello World!'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[4]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Accessing characters by negative index number" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[-3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Slicing strings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[1:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[-4:-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[-2:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Specifying stride while slicing strings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[:5:1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[:5:2]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[::-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s[-1:-7:-2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### string.find() - Returns the index of the first occurrence of the substring" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s.find('o')" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "s.find('or')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Writing to and Reading from CSV File" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Writing to a csv file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open('test.csv', 'w', encoding='utf8', newline='') as wf:\n", " writer = csv.writer(wf)\n", " writer.writerow(('张三','北京'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading from a csv file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open('test.csv', 'r', encoding='utf8') as rf:\n", " r = csv.reader(rf)\n", " for row in r:\n", " print(f'姓名:{row[0]}, 住址:{row[1]}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise1\n", "## Write a Python program to get a string from a given string where all occurrences of its first char have been changed to '@' except the first char itself.\n", "### Sample String : 'restart'\n", "### Expected Result : 'resta@t'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def change_char(str1):\n", " ### START CODE HERE ###\n", "\n", " ### END CODE HERE ###\n", " return str1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check your function\n", "print(change_char('restart'))\n", "print(change_char('text'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Expected output\n", "```\n", "resta@t\n", "tex@\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise2\n", "## Given an input string with the combination of the lower and upper case, arrange characters in such a way that all lowercase letters should come first.\n", "### 
Sample String : 'PyNaTive'\n", "### Expected Result : 'yaivePNT'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def arrange_chars(str1):\n", " \n", " ### START CODE HERE ###\n", "\n", " ### END CODE HERE ###\n", " return str1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check your function\n", "print(arrange_chars('PyNaTive'))\n", "print(arrange_chars('OpTYabi'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Expected output\n", "```\n", "yaivePNT\n", "pabiOTY\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Exercise3\n", "## Write a Python function that takes a list of words and returns the word with the largest length.\n", "### Sample List : [\"Python\", \"Text\", \"Analysis\"]\n", "### Expected Result : 'Analysis'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def find_longest_word(words_list):\n", " longest_word = ''\n", " max_len = 0\n", " ### START CODE HERE ###\n", "\n", " ### END CODE HERE ###\n", " return longest_word" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check your function\n", "print(find_longest_word([\"Python\", \"Text\", \"Analysis\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Expected output\n", "```\n", "Analysis\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }