{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'HUEHUEHUE'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(\"hue\" * 3).upper()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "was was was\n", "was was is\n" ] } ], "source": [ "# named text rather than str, so the built-in str() called in a later cell is not shadowed\n", "text = \"is is is\"\n", "print(text.replace(\"is\", \"was\"))\n", "print(text.replace(\"is\", \"was\", 2))  # optional third argument caps the number of replacements" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'world'" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\" \\tworld\\n \".strip() # strip, lstrip, rstrip" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "b'946809' ### 946809\n" ] } ], "source": [ "print(str(b'946809'), end = \" ### \" )\n", "print(b'946809'.decode('utf8') )" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'3.56'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"%.2f\" % float(\"3.5555\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "bytes" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type('foo'.encode('utf-8'))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ParseResult(scheme='http', netloc='imgqn.xxx.com', path='/upload_files/2015/05/29/yyy.jpg!730x0.jpg', params='', query='', fragment='')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from urllib.parse import urlparse\n", "\n", "def get_filename_from_url(url):\n", " return 
urlparse(url).path.split('/')[-1]\n", "\n", "url = \"http://imgqn.xxx.com/upload_files/2015/05/29/yyy.jpg!730x0.jpg\"\n", "urlparse(url)" ] }, { "cell_type": "code", "execution_count": 240, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from bs4 import BeautifulSoup # analyze html\n", "#http://www.crummy.com/software/BeautifulSoup/bs4/doc/\n", "# NOTE(review): html_doc is not defined in the visible cells - define it before running this cell\n", "soup = BeautifulSoup(html_doc)\n", "soup.p['class']\n", "soup.find_all('a')\n", "soup.find_all('img', src=True)  # only <img> tags that carry a src attribute\n", "soup.find_all(\"div\", { \"class\" : \"xxx\"}) \n", "soup.find(id=\"link3\")\n", "# Tillie" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Regex\n", "https://www.regex101.com/ \n", "http://regexr.com/ \n", "https://docs.python.org/3/library/re.html" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'your name your name'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "re.sub(r'(?i)\\b(u|you+)\\b', \"your name\", 'u YOU')" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": true }, "outputs": [], "source": [ "re.match(\"c\", \"abcdef\")  # match() only tries at the beginning of the string, so this returns None" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<_sre.SRE_Match object; span=(2, 3), match='c'>" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.search(\"c\", \"abcdef\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n" ] }, { "data": { "text/plain": [ "<_sre.SRE_Match object; span=(0, 1), match='1'>" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m = re.search(r\"\\w\", '1dfsde2')  # \\w matches letters as well as digits and the underscore\n", "if m: \n", " print(m.group(0))\n", "m " ] }, { "cell_type": 
"code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "import re\n", "pattern=\"BEGIN:VCARD.*?END:VCARD\"\n", "result = re.findall(pattern,content,re.DOTALL) " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Positive Lookbehind:\ta*c a b c\n", "Negative Lookbehind:\tabc a * c\n", "Positive Lookahead:\ta*c a b c\n", "Negative Lookahead:\tabc a * c\n" ] } ], "source": [ "import re\n", "print( 'Positive Lookbehind:\\t' + re.sub(u'(?<=a)b', \"*\", 'abc a b c') )\n", "print( 'Negative Lookbehind:\\t' + re.sub(u'(?