% References on web page template detection, boilerplate removal and
% content extraction, plus the DOM specification and tree edit distance.
%
% Conventions used in this file:
%   * Entries and field values are brace-delimited, one field per line.
%   * Names are written in "von Last, First" form.
%   * Accented letters are written as brace-wrapped accent commands, so a
%     bare double quote never appears in a value and BibTeX treats the
%     accented letter as a single character.
%   * In titles, capitalised words that a sentence-casing style would
%     otherwise lowercase are protected as whole words; acronyms are
%     protected in every field.

@inproceedings{BarR02,
  author    = {Bar-Yossef, Ziv and Rajagopalan, Sridhar},
  year      = {2002},
  title     = {Template detection via data mining and its applications},
  booktitle = {Proceedings of the 11th International Conference on World Wide Web ({WWW}'02)},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {580--591},
  doi       = {10.1145/511446.511522},
}

@inproceedings{BarCKS08,
  author    = {Baroni, Marco and Chantree, Francis and Kilgarriff, Adam and Sharoff, Serge},
  year      = {2008},
  title     = {{Cleaneval}: a {Competition} for {Cleaning} {Web} {Pages}},
  booktitle = {Proceedings of the International Conference on Language Resources and Evaluation ({LREC}'08)},
  publisher = {European Language Resources Association},
  pages     = {638--643},
  url       = {http://www.lrec-conf.org/proceedings/lrec2008/summaries/162.html},
}

@inproceedings{BurR09,
  author    = {Burget, Radek and Rudolfova, Ivana},
  year      = {2009},
  title     = {Web {Page} {Element} {Classification} {Based} on {Visual} {Features}},
  booktitle = {Proceedings of the 1st Asian Conference on Intelligent Information and Database Systems ({ACIIDS}'09)},
  publisher = {{IEEE} Computer Society},
  address   = {Washington, DC, USA},
  pages     = {67--72},
  doi       = {10.1109/ACIIDS.2009.71},
}

@inproceedings{CarJLRC11,
  author    = {Cardoso, Eduardo and Jabour, Iam and Laber, Eduardo and Rodrigues, Rog{\'e}rio and Cardoso, Pedro},
  year      = {2011},
  title     = {An efficient language-independent method to extract content from news webpages},
  booktitle = {Proceedings of the 11th {ACM} symposium on Document Engineering ({DocEng}'11)},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {121--128},
  doi       = {10.1145/2034691.2034720},
}

@inproceedings{Cha01,
  author    = {Chakrabarti, Soumen},
  year      = {2001},
  title     = {Integrating the {Document Object Model} with hyperlinks for enhanced topic distillation and information extraction},
  booktitle = {Proceedings of the 10th International Conference on World Wide Web ({WWW}'01)},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {211--220},
  doi       = {10.1145/371920.372054},
}

% The author is an organisation, so the whole name is wrapped in an extra
% pair of braces to keep it from being split into first name and surname.
% The address inside \url is the plain URL, without case-protection braces.
@misc{DOM,
  author       = {{W3C Consortium}},
  year         = {1997},
  title        = {{Document Object Model} ({DOM})},
  howpublished = {Available from {URL}: \url{http://www.w3.org/DOM/}},
}

@inproceedings{FerZBB08,
  author    = {Ferraresi, Adriano and Zanchetta, Eros and Baroni, Marco and Bernardini, Silvia},
  year      = {2008},
  title     = {Introducing and evaluating {ukWaC}, a very large web-derived corpus of {English}},
  booktitle = {Proceedings of the 4th Web as Corpus Workshop ({WAC}-4)},
  pages     = {47--54},
}

@inproceedings{GibPT05,
  author    = {Gibson, David and Punera, Kunal and Tomkins, Andrew},
  year      = {2005},
  title     = {The volume and evolution of web page templates},
  editor    = {Ellis, Allan and Hagino, Tatsuya},
  booktitle = {Proceedings of the 14th International Conference on World Wide Web ({WWW}'05)},
  publisher = {ACM},
  pages     = {830--839},
  doi       = {10.1145/1062745.1062763},
}

@inproceedings{Got08,
  author    = {Gottron, Thomas},
  year      = {2008},
  title     = {Content {Code} {Blurring}: {A} {New} {Approach} to {Content} {Extraction}},
  editor    = {Tjoa, A. Min and Wagner, Roland R.},
  booktitle = {Proceedings of the 19th International Workshop on Database and Expert Systems Applications ({DEXA}'08)},
  publisher = {{IEEE} Computer Society},
  pages     = {29--33},
  doi       = {10.1109/DEXA.2008.43},
}

@article{InsST13,
  author  = {Insa, David and Silva, Josep and Tamarit, Salvador},
  year    = {2013},
  title   = {Using the words/leafs ratio in the {DOM} tree for content extraction},
  journal = {The Journal of Logic and Algebraic Programming},
  volume  = {82},
  number  = {8},
  pages   = {311--325},
  doi     = {10.1016/j.jlap.2013.01.002},
}

@inproceedings{Koh09,
  author    = {Kohlsch{\"u}tter, Christian},
  year      = {2009},
  title     = {A densitometric analysis of web template content},
  editor    = {Quemada, Juan and Le{\'o}n, Gonzalo and Maarek, Yo{\"e}lle S. and Nejdl, Wolfgang},
  booktitle = {Proceedings of the 18th International Conference on World Wide Web ({WWW}'09)},
  publisher = {ACM},
  pages     = {1165--1166},
  doi       = {10.1145/1526709.1526909},
}

@inproceedings{KohFN10,
  author    = {Kohlsch{\"u}tter, Christian and Fankhauser, Peter and Nejdl, Wolfgang},
  year      = {2010},
  title     = {Boilerplate detection using shallow text features},
  editor    = {Davison, Brian D. and Suel, Torsten and Craswell, Nick and Liu, Bing},
  booktitle = {Proceedings of the 3rd International Conference on Web Search and Web Data Mining ({WSDM}'10)},
  publisher = {ACM},
  pages     = {441--450},
  doi       = {10.1145/1718487.1718542},
}

@inproceedings{KohN08,
  author    = {Kohlsch{\"u}tter, Christian and Nejdl, Wolfgang},
  year      = {2008},
  title     = {A densitometric approach to web page segmentation},
  editor    = {Shanahan, James G. and Amer-Yahia, Sihem and Manolescu, Ioana and Zhang, Yi and Evans, David A. and Kolcz, Aleksander and Choi, Key-Sun and Chowdhury, Abdur},
  booktitle = {Proceedings of the 17th {ACM} Conference on Information and Knowledge Management ({CIKM}'08)},
  publisher = {ACM},
  pages     = {1173--1182},
  doi       = {10.1145/1458082.1458237},
}

@inproceedings{ReiGSL04,
  author    = {de Castro Reis, Davi and Golgher, Paulo Braz and Silva, Altigran Soares and Laender, Alberto Henrique Frade},
  year      = {2004},
  title     = {Automatic web news extraction using tree edit distance},
  booktitle = {Proceedings of the 13th International Conference on World Wide Web ({WWW}'04)},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {502--511},
  doi       = {10.1145/988672.988740},
}

@article{Tai79,
  author  = {Tai, Kuo Chung},
  year    = {1979},
  title   = {The {Tree-to-Tree} {Correction} {Problem}},
  journal = {Journal of the {ACM}},
  volume  = {26},
  number  = {3},
  pages   = {422--433},
  doi     = {10.1145/322139.322143},
}

@inproceedings{VieSPMCF06,
  author    = {Vieira, Karane and da Silva, Altigran S. and Pinto, Nick and de Moura, Edleno S. and Cavalcanti, Jo{\~a}o M. B. and Freire, Juliana},
  year      = {2006},
  title     = {A fast and robust method for web page template detection and removal},
  booktitle = {Proceedings of the 15th {ACM} International Conference on Information and Knowledge Management ({CIKM}'06)},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {258--267},
  doi       = {10.1145/1183614.1183654},
}

@inproceedings{WenHH10,
  author    = {Weninger, Tim and Hsu, William Henry and Han, Jiawei},
  year      = {2010},
  title     = {{CETR}: {Content} {Extraction} via {Tag} {Ratios}},
  editor    = {Rappa, Michael and Jones, Paul and Freire, Juliana and Chakrabarti, Soumen},
  booktitle = {Proceedings of the 19th International Conference on World Wide Web ({WWW}'10)},
  publisher = {ACM},
  pages     = {971--980},
  doi       = {10.1145/1772690.1772789},
}

@inproceedings{YiLL03,
  author    = {Yi, Lan and Liu, Bing and Li, Xiaoli},
  year      = {2003},
  title     = {Eliminating noisy information in Web pages for data mining},
  booktitle = {Proceedings of the 9th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data mining ({KDD}'03)},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {296--305},
  doi       = {10.1145/956750.956785},
}