HTML A tag Regular Expression Pattern

(?i)<a([^>]+)>(.+?)</a>

Extract HTML link Regular Expression Pattern

\s*(?i)href\s*=\s*("([^"]*")|'[^']*'|([^'">\s]+))

Description

(?i)		#inline flag: all matching is case insensitive (this is not a capturing group)
<a              #start with "<a"
  (		#  start of group #1
    [^>]+	#     anything except ">", at least one character
   )		#  end of group #1
  >		#     followed by ">"
    (.+?)	#	group #2: match anything, as few characters as possible
         </a>	#	  end with "</a>"
\s*			   #can start with whitespace
  (?i)			   # all matching is case insensitive
     href		   #  followed by the word "href"
        \s*=\s*		   #   allow spaces on either side of the equal sign
              (		   #    start of group #1
               "([^"]*")   #      a string enclosed in double quotes - "string"
               |	   #	  ..or
               '[^']*'	   #      a string enclosed in single quotes - 'string'
               |           #	  ..or
               ([^'">\s]+) #      an unquoted value: no single quote, double quote, ">" or whitespace
	      )		   #    end of group #1

Here is a simple Java link extractor. It uses the 1st pattern to capture the attributes and the link text of each "a" tag, and then applies the 2nd pattern to those attributes to extract the ‘href’ value (the link), with some logic as shown below.

Java Regular Expression Example

package com.mkyong.regex;
 
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Extracts hyperlinks (the href value and the link text) from HTML markup
 * using regular expressions.
 *
 * <p>Instances keep no per-call state: the compiled patterns are shared
 * constants and every call to {@link #grabHTMLLinks(String)} works on its own
 * matchers and returns a fresh result.
 */
public class HTMLLinkExtrator{

	/**
	 * Matches an anchor element; group 1 = the attribute text, group 2 = the link text.
	 * Stricter than the plain {@code <a([^>]+)>} form on purpose:
	 * whitespace is required right after "a" so that tags such as
	 * {@code <abbr>} or {@code <area>} are not mistaken for anchors, and the
	 * "s" (DOTALL) flag lets the link text span several lines.
	 */
	private static final String HTML_A_TAG_PATTERN =
			"(?is)<a\\s+([^>]+)>(.+?)</a>";

	/**
	 * Matches an href attribute; group 1 = the value exactly as written
	 * (surrounding quotes included when the value is quoted).
	 * The negative look-behind rejects attribute names that merely end in
	 * "href", for example {@code data-href}.
	 */
	private static final String HTML_A_HREF_TAG_PATTERN =
			"(?i)(?<![\\w-])href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";

	// Compiled once; Pattern objects are immutable and can be shared.
	private static final Pattern PATTERN_TAG = Pattern.compile(HTML_A_TAG_PATTERN);
	private static final Pattern PATTERN_LINK = Pattern.compile(HTML_A_HREF_TAG_PATTERN);

	/** Creates a new extractor. */
	public HTMLLinkExtrator(){
	}

	/**
	 * Extracts every link found in the given HTML.
	 *
	 * <p>One entry is produced for each href attribute of each anchor
	 * element. The link is returned as written in the markup, so a quoted
	 * value keeps its quotes (for example {@code 'http://www.google.com'}).
	 *
	 * @param html html content to scan; {@code null} is treated as empty
	 * @return Vector of links and link texts, in document order; empty when
	 *         nothing matches, never {@code null}
	 */
	public Vector<HtmlLink> grabHTMLLinks(final String html){

		Vector<HtmlLink> result = new Vector<HtmlLink>();

		if (html == null) {
			return result;
		}

		// Matchers are locals so that concurrent or nested calls cannot
		// interfere with each other.
		Matcher matcherTag = PATTERN_TAG.matcher(html);

		while (matcherTag.find()) {

			String attributes = matcherTag.group(1); // everything between "<a" and ">"
			String linkText = matcherTag.group(2);   // everything between ">" and "</a>"

			Matcher matcherLink = PATTERN_LINK.matcher(attributes);

			while (matcherLink.find()) {
				result.add(new HtmlLink(matcherLink.group(1), linkText));
			}
		}

		return result;
	}

	/**
	 * A single extracted link: the href value and the text of the anchor.
	 * Declared static because it never uses the enclosing extractor.
	 */
	static class HtmlLink {

		String link;
		String linkText;

		HtmlLink(String link, String linkText){
			this.link = link;
			this.linkText = linkText;
		}

		@Override
		public String toString() {
			return new StringBuilder("Link : ")
					.append(this.link)
					.append(" Link Text : ")
					.append(this.linkText)
					.toString();
		}
	}
}

Unit Test – HTMLLinkExtratorTest

package com.mkyong.regex;
 
import java.util.Vector;
 
import org.testng.Assert;
import org.testng.annotations.*;
 
import com.mkyong.regex.HTMLLinkExtrator.HtmlLink;
 
/**
 * HTML link extrator Testing
 * @author mkyong
 *
 */
public class HTMLLinkExtratorTest {
 
  private HTMLLinkExtrator htmlLinkExtrator;
 
  @BeforeClass
  public void initData(){
	htmlLinkExtrator = new HTMLLinkExtrator();
  }
 
  @DataProvider
  public Object[][] HTMLContentProvider() {
    return new Object[][]{
     new Object[] {"abc hahaha <a href='http://www.google.com'>google</a>"},
     new Object[] {"abc hahaha <a HREF='http://www.google.com'>google</a>"},
     new Object[] {"abc hahaha <A HREF='http://www.google.com'>google</A> , " +
       "abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>"},
     new Object[] {"abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>"},
     new Object[] {"abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>"},
     new Object[] {"abc hahaha <a HREF=http://www.google.com>google</a>"},
   };
}
 
  @Test(dataProvider = "HTMLContentProvider")
  public void ValidHTMLLinkTest(String html) {
 
	Vector<HtmlLink> links = htmlLinkExtrator.grabHTMLLinks(html);
 
	Assert.assertTrue(links.size()!=0);
 
	for(int i=0; i<links.size() ; i++){
		HtmlLink htmlLinks = links.get(i);
		System.out.println(htmlLinks);
	}
 
  }	
}

Unit Test – Result

[Parser] Running:
  E:\workspace\mkyong\temp-testng-customsuite.xml
 
Link : 'http://www.google.com' Link Text : google
Link : 'http://www.google.com' Link Text : google
Link : 'http://www.google.com' Link Text : google
Link : 'http://www.google.com' Link Text : google
Link : 'http://www.google.com' Link Text : google
Link : 'http://www.google.com' Link Text : google
Link : http://www.google.com Link Text : google
PASSED: ValidHTMLLinkTest("abc hahaha <a href='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF='http://www.google.com'>google</a>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com'>google</A> , 
abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>")
PASSED: ValidHTMLLinkTest("abc hahaha <a HREF=http://www.google.com>google</a>")
 
===============================================
    com.mkyong.regex.HTMLLinkExtratorTest
    Tests run: 6, Failures: 0, Skips: 0
===============================================
 
 
===============================================
mkyong
Total tests run: 6, Failures: 0, Skips: 0
===============================================
Any Java questions or problems? please post at this JavaNullPointer.com forum, see you there ~
[ Read More ] You can find more similar articles at Java RegEx Tutorials