Search This Blog

Thursday, May 7, 2015

Categorize StackOverflow questions by tags

Below is a demo of my "extension" to StackOverflow.
Here I can categorize all the questions I have asked by tag. Clicking a question takes you to that question's page on StackOverflow.

Important note: this example works only while my local machine is serving the proxy server. I have not found any free .NET web hosting service, and my local host is only up while I am using my computer, so chances are very high that you will not see this working. But it does :-)

Remark: my scenario does not take into account paging in scraping the data. My questions page is one page only.

Search questions by tag:



Basically, what was done was: 
  1. Use server-side code as a proxy server, to be able to overcome the Same Origin Policy. One reason for the existence of this policy is described here.
  2. I am using .NET C#, so I made use of HtmlAgilityPack and ScrapySharp nuget packages to scrape the html from my StackOverflow questions page. There I extract all data I want to manipulate in the client, and send it back as JSON.
  3. Finally, in the client, you just do whatever you want with that data. I wanted to display an autocomplete textbox with all the tags I ever asked a question about, so this is what I have done - using typeahead.js for the autocomplete. For opening the pop-up I used jQueryUI.

I was quite happy after making my first successful cross-origin (CORS) request :-). I had tried a couple of times in the past, without success. The next step is doing this on the client side only, with no server code (this was my original intent; however, I came across StackOverflow material that led me to the proxy-server solution, which is quite simple).

The client source:

<script>
    $(document).ready(function() {
        function generateAutoComplete(elementID, data, clear) {
            /*--------- Bloodhound Search Engine-----------*/
            clear = typeof clear === "undefined" ? true : clear;
            $(elementID).typeahead('destroy');

            if (clear) {
                $(elementID).val('');
            }

            // constructs the suggestion engine
            var bloodHoundData = new Bloodhound({
                datumTokenizer: Bloodhound.tokenizers.obj.whitespace('value'),
                queryTokenizer: Bloodhound.tokenizers.whitespace,
                local: $.map(data, function(ddlElement) {
                    return {
                        value: ddlElement
                    };
                }),
                limit: 10
            });

            // kicks off the loading/processing of `local` and `prefetch`
            bloodHoundData.initialize();

            $(elementID).typeahead({
                hint: true,
                highlight: true,
                minLength: 1
            }, {
                name: 'data',
                displayKey: 'value',
                // `ttAdapter` wraps the suggestion engine in an adapter that
                // is compatible with the typeahead jQuery plugin
                source: bloodHoundData.ttAdapter()
            });
        }

        var serverData;
        $.ajax({
            url: 'http://localhost:56212/api/GetQuestions',
            method: 'get',
            success: function(data) {
                serverData = data;
                generateAutoComplete('#txtSearch', data.autocompleteSrc);
            },
            error: function(data) {
                console.log(data);
            },
            failed: function(data) {
                console.log(data);
            }
        });

        $("#btnShow").click(function() {
            var dialogContent = "<ul>";
            for (var i = 0; i < serverData.questionsPerTag.length; i++) {
                if (serverData.questionsPerTag[i].tag == $("#txtSearch").val()) {
                    for (var k = 0; k < serverData.questionsPerTag[i].questions.length; k++) {
                        dialogContent += "<li> <a class='link' href='" + serverData.questionsPerTag[i].links[k] + "'>" + serverData.questionsPerTag[i].questions[k] + " </a> </li><br />";
                    }
                    //dialogContent = serverData.questionsPerTag[i].questions.join();
                    break;
                }

            }
            dialogContent += "</ul>";

            $("#dialog").html(dialogContent);
            $("#dialog").dialog({
                height: 500,
                width: 1000
            });

        });

        $("#dialog").on("click", "li .link", function(event) {
            window.open(event.currentTarget.getAttribute("href"), "_blank");
        });

        $(".tt-dataset.tt-dataset-data").css("color", "000");
    });
</script>

Server-side code:
  /// <summary>
  /// Web API controller acting as a proxy: it downloads a StackOverflow user's questions
  /// page, scrapes each question's title, link and tags, and returns them grouped by tag
  /// so that a page on another origin can consume the data.
  /// </summary>
  public class ScrapingController : ApiController
    {
        private const string QuestionsPageUrl = "http://stackoverflow.com/users/1219280/veverke?tab=questions";
        private const string StackOverflowBaseUrl = "http://www.stackoverflow.com/";

        // A single shared client: creating (and never disposing) one HttpClient per request
        // leaks sockets under load.
        private static readonly HttpClient Client = new HttpClient();

        /// <summary>
        /// Scrapes the questions page and groups the questions by tag.
        /// </summary>
        /// <returns>
        /// An anonymous object with <c>autocompleteSrc</c> (the distinct tags) and
        /// <c>questionsPerTag</c> (one entry per tag holding the question titles and links).
        /// </returns>
        [AllowAnonymous]
        [HttpGet]
        [Route("api/GetQuestions")]
        public dynamic GetQuestions()
        {
            // "*" lets any origin read the response; restrict it to the consuming site's
            // origin if this is ever exposed beyond a local demo.
            HttpContext.Current.Response.Headers.Add("Access-Control-Allow-Origin", "*");

            // NOTE: blocks on the async call to keep the action's synchronous signature.
            string result = Client.GetStringAsync(QuestionsPageUrl).Result;

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(result);
            HtmlNode divUserQuestions = doc.GetElementbyId("user-tab-questions");

            List<SOQuestion> originalData = new List<SOQuestion>();

            // If the page layout changes and the container is missing, return empty results
            // instead of failing with a NullReferenceException.
            if (divUserQuestions != null)
            {
                List<HtmlNode> questionsSummary = divUserQuestions.CssSelect(".question-summary .summary").ToList();

                foreach (HtmlNode questionSummary in questionsSummary)
                {
                    HtmlNode title = questionSummary.CssSelect("h3").FirstOrDefault();
                    // Select within the current summary; querying the whole list would give
                    // every question the link of the first one.
                    HtmlNode hyperlink = questionSummary.CssSelect(".question-hyperlink").FirstOrDefault();

                    if (title == null || hyperlink == null)
                    {
                        // Skip summaries that do not have the expected markup.
                        continue;
                    }

                    originalData.Add(new SOQuestion
                    {
                        Text = title.InnerText,
                        Link = hyperlink.GetAttributeValue("href"),
                        Tags = ExtractTags(questionSummary.CssSelect(".tags").FirstOrDefault())
                    });
                }
            }

            List<string> tags = originalData.SelectMany(q => q.Tags).Distinct().ToList();
            List<SOQuestionsPerTag> questionsPerTagResult = new List<SOQuestionsPerTag>();

            foreach (string tag in tags)
            {
                SOQuestionsPerTag questionsPerTag = new SOQuestionsPerTag();
                questionsPerTag.Tag = tag;

                foreach (SOQuestion question in originalData)
                {
                    if (question.Tags != null && question.Tags.Contains(tag))
                    {
                        questionsPerTag.Questions.Add(question.Text);
                        // Scraped hrefs are site-relative and may start with '/'; trim it so
                        // the absolute URL does not contain a double slash.
                        questionsPerTag.Links.Add(StackOverflowBaseUrl + (question.Link ?? string.Empty).TrimStart('/'));
                    }
                }

                questionsPerTagResult.Add(questionsPerTag);
            }

            return new { autocompleteSrc = tags, questionsPerTag = questionsPerTagResult };
        }

        /// <summary>
        /// Splits the text of a question's tags container into individual tag names.
        /// </summary>
        /// <param name="tagsDiv">The tags container node; may be null when none was found.</param>
        /// <returns>The tag names, or an empty list when there is no container.</returns>
        private List<string> ExtractTags(HtmlNode tagsDiv)
        {
            if (tagsDiv == null)
            {
                return new List<string>();
            }

            // A null separator splits on any whitespace; RemoveEmptyEntries drops the empty
            // strings that runs of spaces or newlines between tags would otherwise produce.
            return tagsDiv.InnerText
                .Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
                .ToList();
        }
    }

No comments:

Post a Comment