|
|||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object | +--org.openprivacy.reptile.RSSContentSerializer
Handles serializing HTML content to RSS 1.0
Field Summary | |
static java.lang.String |
COMPONENT_VERSION
|
static boolean |
DEBUG
When true we enable debug mode which prints out information about processing. |
static boolean |
INCLUDE_ACCEPTABLE_WITHIN_FIRSTLEVEL
If true, acceptable elements are also considered first level elements. |
static boolean |
INCLUDE_FORMS
|
static int |
MAX_DESCRIPTION_LENGTH
|
static int |
MAX_TITLE_WIDTH
|
static int |
MIN_CDATA_LENGTH
Variable used to detect if a piece of text is valid CDATA. |
static int |
MIN_DESCRIPTION_LENGTH
Keep adding stripped PCDATA to the description until it is at least this width. |
static int |
MIN_JUNK_DATA_PERCENTAGE
Minimum amount of junk data in PCDATA after which we consider the whole thing junk. |
static int |
MIN_TITLE_WIDTH
|
static int |
MODE_ANCHOR
Mode for matching A name sections. |
static int |
MODE_FLEXIBLE
Flexible parse mode. |
static int |
MODE_MINIMAL
Minimal parse mode. |
static java.lang.String |
USER_AGENT_STRING
|
Constructor Summary | |
RSSContentSerializer()
Create a new RSSContentSerializer instance. |
Method Summary | |
java.lang.String |
cleanseEntities(java.lang.String data)
|
java.lang.String |
cleanseHTML(java.lang.String html)
Cleanse pcdata of junk. |
java.lang.String |
cleansePCDATA(java.lang.String pcdata)
Clean up PCDATA so it is better for RSS - remove fonts - remove |
java.lang.String |
cleanseTitle(java.lang.String title)
Cleanse a title so that it can be represented correctly. |
java.lang.String |
delete(java.lang.String begin_regexp,
java.lang.String end_regexp,
java.lang.String pcdata)
Delete the region between the two regexps and return the two strings. |
java.lang.String |
expand(java.lang.String link)
Expand a link relative to the current site. |
java.lang.String |
getBase()
Get the base of this URL. |
java.lang.String |
getContent()
Return all the content for this item. |
int |
getContentStrippedLength()
Get the length of all the stripped content. |
java.lang.String |
getDescription()
Get the value of description . |
java.lang.String |
getHTML()
Get the value of html . |
boolean |
getInitialized()
|
int |
getMinRepassContentLength()
Get the minimum amount of content we need before another repass. The minimum amount of content we need to do a second pass with td, br elements. |
int |
getMode()
Get the mode we are operating in. |
org.openprivacy.reptile.RSSContentSerializer.PCDATASection[] |
getPCDATASections()
Get all PCDATA entries that were found. |
java.lang.String |
getResource()
Get the value of resource . |
java.lang.String |
getResourceAsString()
|
java.lang.String |
getRSS()
Get the resource as an RSS stream with mod_content |
java.lang.String |
getSite()
Get the site for this resource. |
java.lang.String |
getTitle()
Get the value of title . |
java.lang.String |
getTitle(java.lang.String description)
Attempt to pull out the title from the given description |
void |
init()
Initialize this if it hasn't been done. |
boolean |
isAcceptablePCDATA(org.openprivacy.reptile.RSSContentSerializer.PCDATASection section)
Return true if this is an acceptable PCDATASection. |
boolean |
isHolderElement(java.lang.String local_name)
Return true if the given local_name is a holder that can format HTML across a paragraph. |
boolean |
isJunkContent(java.lang.String content)
Return true if this is junk content. |
static void |
main(java.lang.String[] args)
Handle operations from the command line. |
void |
parse()
Parse this channel. |
java.lang.String |
relativize(java.lang.String content)
Used to fix relative links in HTML content so that everything is expanded. |
void |
setDescription(java.lang.String description)
Set the value of description . |
void |
setHTML(java.lang.String html)
Set the value of html . |
void |
setInitialized(boolean initialized)
|
void |
setModeMinimal()
Set minimal mode and all options. |
void |
setResource(java.lang.String resource)
Set the value of resource . |
void |
setTitle(java.lang.String title)
Set the value of title . |
java.lang.String |
strip(java.lang.String content)
Strip all elements from the given content. |
java.lang.String |
truncate(java.lang.String value,
int length)
Truncate the given value so that |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
public static final java.lang.String COMPONENT_VERSION
public static final java.lang.String USER_AGENT_STRING
public static final int MIN_TITLE_WIDTH
public static final int MAX_TITLE_WIDTH
public static final int MIN_JUNK_DATA_PERCENTAGE
public static final boolean INCLUDE_ACCEPTABLE_WITHIN_FIRSTLEVEL
public static final int MODE_MINIMAL
public static final int MODE_FLEXIBLE
public static final int MODE_ANCHOR
public static final int MIN_DESCRIPTION_LENGTH
public static final int MAX_DESCRIPTION_LENGTH
public static final int MIN_CDATA_LENGTH
This is the title
" but I believe this is acceptable
public static final boolean DEBUG
public static final boolean INCLUDE_FORMS
Constructor Detail |
public RSSContentSerializer()
Create a new RSSContentSerializer instance.
Method Detail |
public java.lang.String getHTML()
Get the value of html.
public void setHTML(java.lang.String html)
Set the value of html.
public java.lang.String getResourceAsString() throws java.lang.Exception
java.lang.Exception
public java.lang.String getTitle()
Get the value of title.
public void setTitle(java.lang.String title)
Set the value of title.
public java.lang.String getDescription()
Get the value of description.
public void setDescription(java.lang.String description)
Set the value of description.
public java.lang.String getResource()
Get the value of resource.
public void setResource(java.lang.String resource)
Set the value of resource.
public void init() throws java.lang.Exception
java.lang.Exception
public void parse() throws java.lang.Exception
java.lang.Exception
public java.lang.String getRSS() throws java.lang.Exception
java.lang.Exception
public java.lang.String strip(java.lang.String content) throws java.lang.Exception
java.lang.Exception
public java.lang.String relativize(java.lang.String content) throws java.lang.Exception
java.lang.Exception
public static void main(java.lang.String[] args)
public org.openprivacy.reptile.RSSContentSerializer.PCDATASection[] getPCDATASections()
public java.lang.String getContent()
public java.lang.String getBase()
public java.lang.String getSite()
public java.lang.String expand(java.lang.String link) throws java.lang.Exception
java.lang.Exception
public boolean isJunkContent(java.lang.String content) throws java.lang.Exception
java.lang.Exception
public int getMode()
public void setModeMinimal()
public java.lang.String cleanseHTML(java.lang.String html) throws java.lang.Exception
java.lang.Exception
public java.lang.String delete(java.lang.String begin_regexp, java.lang.String end_regexp, java.lang.String pcdata) throws java.lang.Exception
java.lang.Exception
public java.lang.String cleansePCDATA(java.lang.String pcdata) throws java.lang.Exception
java.lang.Exception
public int getContentStrippedLength()
public boolean isAcceptablePCDATA(org.openprivacy.reptile.RSSContentSerializer.PCDATASection section)
public boolean isHolderElement(java.lang.String local_name)
public void setInitialized(boolean initialized)
public boolean getInitialized()
public java.lang.String truncate(java.lang.String value, int length)
public java.lang.String getTitle(java.lang.String description) throws java.lang.Exception
java.lang.Exception
public int getMinRepassContentLength() throws java.lang.Exception
java.lang.Exception
public java.lang.String cleanseEntities(java.lang.String data) throws java.lang.Exception
java.lang.Exception
public java.lang.String cleanseTitle(java.lang.String title) throws java.lang.Exception
java.lang.Exception
|
|||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |