{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from biased_stop_words import get_stop_words\n",
    "from faker import Faker\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "fake = Faker()\n",
    "fake.text()\n",
    "\n",
    "polluting_words = get_stop_words('us-common-names')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = []\n",
    "for _ in range(10):\n",
    " words = fake.words(nb=100)\n",
    " documents.append(fake.sentence(nb_words=25, ext_word_list=words))\n",
    " \n",
    "polluted_documents = []\n",
    "for _ in range(10):\n",
    " words = fake.words(nb=95) + random.sample(polluting_words, 5)\n",
    " polluted_documents.append(fake.sentence(nb_words=25, ext_word_list=words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Policy change window sometimes help interview management health whose attack red image true true market sometimes policy girl close hair hair town success.',\n",
       " 'Region great subject decade author chance debate position discussion feeling heart data include develop fear.',\n",
       " 'Finally fall little personal hard charge theory front time use use myself without would car media purpose.',\n",
       " 'Available out nature two security despite far recent because edge down major sign nature role fine simply concern security security fine side purpose available business because major issue security Mr role adult manage member.',\n",
       " 'Sign event positive material behind event picture then strong type total serious campaign total behind black still hotel same food remember society investment kind add yourself development federal apply same still.',\n",
       " 'Fly safe collection former stop each so learn sense seven list bag wind lead ball politics imagine fly need.',\n",
       " 'Evidence region beautiful energy sit face beautiful force whatever me water bit sit energy bank agree girl.',\n",
       " 'Gas authority next save knowledge child last certain marriage already weight occur build soldier subject wide deep share feeling low soon difficult foot amount child only oil buy buy own.',\n",
       " 'Employee make feel whatever result hot foreign size discover probably happy series agent feel ready mother several much suggest remember result ball report very prevent.',\n",
       " 'Ball both last less Congress produce agent method understand administration act sister time church writer article respond appear should drop fight art Congress health last dream weight article carry ball into.']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Rate pretty jenna discuss wind money put do people bed safe expert level dark some determine white party expert tend finally.',\n",
       " 'Even itself let employee alexandra popular paxton seem about now character score likely foreign news see another your couple let end recent paxton cut sense even.',\n",
       " 'Significant other as worker source cost like character him eye brother marshall other finish suffer painting again service candidate full fish attention author many madalyn.',\n",
       " 'Wonder gas consumer forget score three us relate information luella amount or consumer magazine amount because high luella total itself because gas rich pull consumer build employee plan high art three because staff.',\n",
       " 'Teach neal sell past away citizen neal feel brother baby ago son maybe sound attention despite red simply too its important sometimes hold close green site health off simply after.',\n",
       " 'Building hear citizen they dog table song rather billion person skin join mother stand loss growth enter like move fine available treat east home this job blood paper rather drug rate rate provide.',\n",
       " 'Usually usually peace box feeling baby kind do allow fund gabriel together feeling do future middle easy successful feeling current indicate authority father minute usually recently beyond should current why feeling concern need.',\n",
       " 'Hope serve dog property year traditional how dog nice near property hospital which seat save machine institution side international.',\n",
       " 'Knowledge boy sport also phone simple beautiful officer mother her sienna ball record herself door sienna rock herself freddy I ball phone ball worry fish own religious concern.',\n",
       " 'Crime throw line east only matter building place music candidate society compare challenge new must agreement leg throw you.']"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "polluted_documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}