News:

Choose a design and let our professionals help you build a successful website   - ITAcumens

Main Menu

Different method of converting web page(html2text)

Started by Kalyan, Mar 23, 2008, 04:33 PM

Previous topic - Next topic

Kalyan

Different method of converting web page(html2text)

This program extracts the plain text from any web page, whether it comes from Amazon, Google, Yahoo, or any other site.

import java.io.File;
import java.io.FileInputStream;

import java.io.Reader;
import java.io.StringReader;

import java.io.IOException;
/**
 * Converts HTML markup into a plain-text approximation.
 *
 * <p>The converter is a single-pass character scanner: tags are replaced by
 * simple text equivalents (line breaks, bullets, {@code *} for bold, and so
 * on), a small set of character entities is decoded, and runs of whitespace
 * outside {@code <pre>} are collapsed to one space. If the document contains
 * a {@code <body>} tag only the text inside the body is returned; otherwise
 * all text is returned.
 *
 * <p>Instances keep parsing state in fields and are therefore not safe for
 * concurrent use. An instance may be reused for several sequential
 * conversions.
 *
 * <p>NOTE(review): the published listing this class was recovered from had
 * lost its string escape sequences (quote and newline escapes). The literals
 * below are a reconstruction; confirm the exact number of newlines per tag
 * against the original author's intent if the layout matters.
 */
public class HTML2Text
{
   /** Tags (opening or closing) that are rendered as a single line break. */
   private static final String[] LINE_BREAK_TAGS = {
      "br",
      "h1", "h2", "h3", "h4", "h5", "h6", "h7",
      "/h1", "/h2", "/h3", "/h4", "/h5", "/h6", "/h7",
      "/dl", "/ul", "/ol",
      "table", "/table", "form", "/form"
   };

   /** Set once a {@code <body>} tag has been seen during the current conversion. */
   boolean body_found = false;
   /** True while the scanner is between {@code <body>} and {@code </body>}. */
   boolean in_body = false;
   /** True while inside {@code <center>}; recorded but not used for layout. */
   boolean center = false;
   /** True while inside {@code <pre>}; disables whitespace collapsing. */
   boolean pre = false;
   /** Target of the most recent {@code <a href="...">}; emitted when {@code </a>} is seen. */
   String href = "";

   /**
    * Converts an HTML document (or fragment) to plain text.
    *
    * @param source the HTML text to convert
    * @return the extracted text with leading and trailing whitespace trimmed
    * @throws Exception if reading from the in-memory source fails
    */
   public String convert(String source) throws Exception
   {
      // Start every conversion from a clean state so that an instance can be
      // reused; otherwise flags left over from a previous document would
      // route the text into the wrong buffer.
      body_found = false;
      in_body = false;
      center = false;
      pre = false;
      href = "";

      StringBuilder bodyText = new StringBuilder();   // text inside <body>
      StringBuilder otherText = new StringBuilder();  // text outside <body>
      StringReader input = new StringReader(source);

      try
      {
         int c = input.read();
         while (c != -1) // Convert until EOF
         {
            String text;
            if (c == '<')
            {
               // A tag: consume the rest of it and translate it.
               text = convertTag(getTag(input));
            }
            else if (c == '&')
            {
               text = decodeEntity(getSpecial(input));
            }
            else if (!pre && Character.isWhitespace((char) c))
            {
               // Collapse whitespace runs to a single space.
               StringBuilder current = in_body ? bodyText : otherText;
               boolean afterWhitespace = current.length() > 0
                  && Character.isWhitespace(current.charAt(current.length() - 1));
               text = afterWhitespace ? "" : " ";
            }
            else
            {
               text = String.valueOf((char) c);
            }

            // Chosen after the tag was processed: <body> and </body> change
            // in_body, and their replacement text goes to the new target.
            StringBuilder target = in_body ? bodyText : otherText;
            target.append(text);

            c = input.read();
         }
      }
      finally
      {
         input.close();
      }

      StringBuilder chosen = body_found ? bodyText : otherText;
      return chosen.toString().trim();
   }

   /**
    * Reads the remainder of a tag whose opening {@code '<'} has already been
    * consumed. Nested {@code '<'} characters increase the nesting level, so
    * reading stops at the matching {@code '>'} or at end of input.
    *
    * @param r the reader positioned just after the opening {@code '<'}
    * @return the full tag text, including the leading {@code '<'}
    * @throws IOException if the reader fails
    */
   String getTag(Reader r) throws IOException
   {
      StringBuilder tag = new StringBuilder();
      tag.append('<');

      int depth = 1;
      while (depth > 0)
      {
         int c = r.read();
         if (c == -1)
         {
            break; // EOF inside a tag: return what was read
         }
         tag.append((char) c);
         if (c == '<')
         {
            depth++;
         }
         else if (c == '>')
         {
            depth--;
         }
      }

      return tag.toString();
   }

   /**
    * Reads the name of a character entity whose {@code '&'} has already been
    * consumed. Letters and digits are accepted, plus a leading {@code '#'}
    * for numeric references. If the name is terminated by {@code ';'} the
    * semicolon is consumed and appended; otherwise the terminating character
    * is pushed back via {@code reset()}.
    *
    * @param r a reader that supports {@code mark}/{@code reset}
    * @return the entity name, with a trailing {@code ';'} when one was present
    * @throws IOException if the reader fails or does not support marking
    */
   String getSpecial(Reader r) throws IOException
   {
      StringBuilder name = new StringBuilder();
      r.mark(1); // Remember the position so a non-entity character can be pushed back
      int c = r.read();

      while (c != -1
         && (Character.isLetterOrDigit((char) c) || (c == '#' && name.length() == 0)))
      {
         name.append((char) c);
         r.mark(1);
         c = r.read();
      }

      if (c == ';')
      {
         name.append(';');
      }
      else
      {
         r.reset();
      }

      return name.toString();
   }

   /**
    * Translates the entity text returned by {@link #getSpecial(Reader)} into
    * its plain-text replacement. Only properly terminated entities (ending in
    * {@code ';'}) are decoded; anything else is returned unchanged with the
    * {@code '&'} restored.
    */
   private String decodeEntity(String special)
   {
      if (!special.endsWith(";"))
      {
         return "&" + special;
      }

      String name = special.substring(0, special.length() - 1);
      if (name.equals("lt") || name.equals("#60"))
      {
         return "<";
      }
      if (name.equals("gt") || name.equals("#62"))
      {
         return ">";
      }
      if (name.equals("amp") || name.equals("#38"))
      {
         return "&";
      }
      if (name.equals("nbsp") || name.equals("#160"))
      {
         return " ";
      }
      if (name.equals("quot") || name.equals("#34"))
      {
         return "\"";
      }
      if (name.equals("copy") || name.equals("#169"))
      {
         return "[Copyright]";
      }
      if (name.equals("reg") || name.equals("#174"))
      {
         return "[Registered]";
      }
      if (name.equals("trade") || name.equals("#153"))
      {
         return "[trademark]";
      }
      return "&" + special; // Unknown entity: keep it verbatim
   }

   /**
    * Tests whether a raw tag is the given element, ignoring case.
    *
    * @param s1 the raw tag text, starting with {@code '<'}
    * @param s2 the element name to test for, e.g. {@code "br"} or {@code "/ul"}
    * @return true if {@code s1} starts with {@code '<'} + {@code s2} followed
    *         by {@code '>'}, {@code '/'} or whitespace
    */
   boolean isTag(String s1, String s2)
   {
      String prefix = "<" + s2;
      if (!s1.regionMatches(true, 0, prefix, 0, prefix.length()))
      {
         return false;
      }
      if (s1.length() == prefix.length())
      {
         return false; // Tag cut off right after the name (EOF)
      }
      // The name must end here; otherwise "<b" would also match "<body".
      char next = s1.charAt(prefix.length());
      return next == '>' || next == '/' || Character.isWhitespace(next);
   }

   /** Returns true if {@code tag} is any of the elements listed in {@code names}. */
   private boolean isAnyTag(String tag, String[] names)
   {
      for (int i = 0; i < names.length; i++)
      {
         if (isTag(tag, names[i]))
         {
            return true;
         }
      }
      return false;
   }

   /**
    * Extracts the value of a double-quoted attribute from a raw tag. The
    * attribute name is matched case-insensitively. Single-quoted and unquoted
    * values are not recognised.
    *
    * @return the attribute value, or {@code null} if the attribute is absent
    *         or its closing quote is missing
    */
   private static String attributeValue(String tag, String name)
   {
      String needle = name + "=\"";
      int lastStart = tag.length() - needle.length();
      for (int i = 0; i <= lastStart; i++)
      {
         if (tag.regionMatches(true, i, needle, 0, needle.length()))
         {
            int start = i + needle.length();
            int end = tag.indexOf('"', start);
            return end == -1 ? null : tag.substring(start, end);
         }
      }
      return null;
   }

   /**
    * Translates one raw tag into its plain-text replacement and updates the
    * parsing state ({@code in_body}, {@code pre}, {@code href}, ...).
    *
    * @param t the raw tag text as returned by {@link #getTag(Reader)}
    * @return the replacement text; empty for tags with no textual equivalent
    * @throws IOException declared for compatibility with existing callers
    */
   String convertTag(String t) throws IOException
   {
      String result = "";

      if (isTag(t, "body"))
      {
         in_body = true;
         body_found = true;
      }
      else if (isTag(t, "/body"))
      {
         in_body = false;
         result = "\n";
      }
      else if (isTag(t, "center"))
      {
         result = "\n";
         center = true;
      }
      else if (isTag(t, "/center"))
      {
         result = "\n";
         center = false;
      }
      else if (isTag(t, "pre"))
      {
         result = "\n";
         pre = true;
      }
      else if (isTag(t, "/pre"))
      {
         result = "\n";
         pre = false;
      }
      else if (isTag(t, "p"))
      {
         result = "\n\n";
      }
      else if (isAnyTag(t, LINE_BREAK_TAGS))
      {
         result = "\n";
      }
      else if (isTag(t, "dd") || isTag(t, "li"))
      {
         result = "\n  * ";
      }
      else if (isTag(t, "dt"))
      {
         result = "      ";
      }
      else if (isTag(t, "hr"))
      {
         result = "_________________________________________\n\n";
      }
      else if (isTag(t, "b") || isTag(t, "/b"))
      {
         result = "*";
      }
      else if (isTag(t, "i") || isTag(t, "/i"))
      {
         result = "\"";
      }
      else if (isTag(t, "img"))
      {
         // An image is represented by its alternative text, if any.
         String alt = attributeValue(t, "alt");
         if (alt != null)
         {
            result = alt;
         }
      }
      else if (isTag(t, "a"))
      {
         // Remember the link target; it is printed when the anchor closes.
         String target = attributeValue(t, "href");
         href = target != null ? target : "";
      }
      else if (isTag(t, "/a"))
      {
         if (href.length() > 0)
         {
            result = " [ " + href + " ]";
            href = "";
         }
      }

      return result;
   }

   /**
    * Command-line entry point: converts an HTML file and prints the text to
    * standard output. The file is decoded with the platform default charset.
    *
    * @param argv optional; {@code argv[0]} is the file to convert, defaulting
    *             to {@code html_test_file.html} in the working directory
    * @throws Exception if the file cannot be read or converted
    */
   public static void main(String argv[]) throws Exception
   {
      File file = new File(argv.length > 0 ? argv[0] : "html_test_file.html");

      // Size the buffer from the file length rather than available(), which
      // only reports what can be read without blocking.
      byte[] buf = new byte[(int) file.length()];
      int filled = 0;

      FileInputStream fis = new FileInputStream(file);
      try
      {
         // A single read() may return fewer bytes than requested.
         while (filled < buf.length)
         {
            int n = fis.read(buf, filled, buf.length - filled);
            if (n == -1)
            {
               break;
            }
            filled += n;
         }
      }
      finally
      {
         fis.close();
      }

      String s = new String(buf, 0, filled);
      HTML2Text h = new HTML2Text();
      System.out.println(h.convert(s));
   }
}